Commit 9c94b1d2 authored by Christian Marius Lillelund

Added SHAP values to all cases

parent ae55edaa
#!/usr/bin/env python
import utility.cluster as cluster_ut
import utility.dataset as dataset_ut
import config as cfg
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RandomizedSearchCV
from tools import file_reader, file_writer, preprocessor, feature_maker
from utility import cluster
-from kmodes.kmodes import KModes
+from kmodes import kmodes
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
@@ -36,9 +32,9 @@ def main():
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
-        model = KModes(init=self.init, n_clusters=self.n_clusters,
+        model = kmodes.KModes(init=self.init, n_clusters=self.n_clusters,
n_init=self.n_init,
-                       cat_dissim=cluster.calculate_disimmilarity,
+                       cat_dissim=kmodes.ng_dissim,
n_jobs=-1)
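        # Fit k-modes on the categorical columns only (from index 4 on), cast
        # to str so every value is treated as a categorical level.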
model.fit(X.iloc[:, 4:].astype(str))
predictions = model.predict(X.iloc[:, 4:].astype(str))
@@ -57,8 +53,7 @@ def main():
param_grid = [
{
'cluster_maker__init': ['random', 'Huang', 'Cao'],
-        'cluster_maker__n_clusters': [2, 5, 10, 20, 25, 30,
-                                      35, 40, 45, 50, 100],
+        'cluster_maker__n_clusters': [2, 5, 10, 20, 30, 40, 50, 100],
'cluster_maker__n_init': [1, 5, 10, 20]
}
]
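A minimal sketch of how this grid could be consumed, assuming the transformer above is registered under the step name 'cluster_maker'; the ClusterMaker class name and the final classifier step are illustrative, not from this commit:

pipeline = Pipeline([('cluster_maker', ClusterMaker()),
                     ('classifier', RandomForestClassifier(random_state=0))])
search = RandomizedSearchCV(pipeline, param_grid, n_iter=20,
                            cv=StratifiedKFold(n_splits=5),
                            random_state=0, n_jobs=-1)
# search.fit(X, y) then samples cluster settings and scores the full pipeline.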
FROM python:3.8
COPY . /app
WORKDIR /app
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt
RUN rm -rf requirements.txt
EXPOSE 80
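# Serve the FastAPI app from main.py with uvicorn on all interfaces, port 80.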
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"]
\ No newline at end of file
import fastapi
import uvicorn
import pydantic
import shap
import os
import csv
import joblib
import pandas as pd
-import api.utility as utility
import numpy as np
from typing import Any, Dict, List
from numpy.lib.shape_base import column_stack
from pathlib import Path
ATS_RESOLUTION = 50
@@ -20,26 +22,23 @@ class Features(pydantic.BaseModel):
@app.get('/')
def index():
-    return {'message': 'AIR API v. 0.1'}
+    return {'message': f'AIR API v. 0.1'}
@app.post('/predict_complete')
def predict_complete(incoming_data: Features):
data = incoming_data.dict()
df = prepare_data(data, 'complete')
-    model = utility.read_joblib('complete_xgboost.joblib')
+    model = read_joblib('complete_xgboost.joblib')
prediction = model.predict(df)
probability = model.predict_proba(df).max()
-    _, shap_values = utility.get_shap_values(model, X_test=df)
-    shap_values_flat = [round(float(val), 3) for val in shap_values[0]]
-    shap_values_dict = dict(zip(df.columns, shap_values_flat))
+    shap_values = get_shap_values(model, X_test=df)
return {
'prediction': int(prediction[0]),
'probability': float(probability),
-        'shap_values': shap_values_dict,
+        'shap_values': shap_values
}
@app.post('/predict_success')
@@ -47,14 +46,16 @@ def predict_success(incoming_data: Features):
data = incoming_data.dict()
df = prepare_data(data, 'success')
-    model = utility.read_joblib('success_xgboost.joblib')
+    model = read_joblib('success_xgboost.joblib')
prediction = model.predict(df)
probability = model.predict_proba(df).max()
+    shap_values = get_shap_values(model, X_test=df)
return {
'prediction': int(prediction[0]),
-        'probability': float(probability)
+        'probability': float(probability),
+        'shap_values': shap_values
}
@app.post('/predict_fall')
@@ -62,14 +63,16 @@ def predict_fall(incoming_data: Features):
data = incoming_data.dict()
df = prepare_data(data, 'fall')
-    model = utility.read_joblib('fall_xgboost.joblib')
+    model = read_joblib('fall_xgboost.joblib')
prediction = model.predict(df)
probability = model.predict_proba(df).max()
+    shap_values = get_shap_values(model, X_test=df)
return {
'prediction': int(prediction[0]),
-        'probability': float(probability)
+        'probability': float(probability),
+        'shap_values': shap_values
}
def prepare_data(data: dict, case: str) -> pd.DataFrame:
@@ -77,7 +80,7 @@ def prepare_data(data: dict, case: str) -> pd.DataFrame:
new_data_df = pd.DataFrame.from_dict(new_data)
new_data_df['NumberAts'] = len(new_data_df['Ats'][0].split(","))
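    # NumberAts counts the comma-separated assistive-device codes supplied for this citizen.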
-    df = utility.split_categorical_columns(new_data_df, col='Ats', tag='Ats',
+    df = split_categorical_columns(new_data_df, col='Ats', tag='Ats',
resolution=ATS_RESOLUTION)
cols_ats = [str(i)+'Ats' for i in range(1, ATS_RESOLUTION+1)]
@@ -91,7 +94,7 @@ def prepare_data(data: dict, case: str) -> pd.DataFrame:
new_data_df['Cluster'] = pd.to_numeric(new_data_df['Cluster'])
for i in range(1, ATS_RESOLUTION+1):
-        embedding = utility.read_embedding(f'{case}_{i}Ats.csv')
+        embedding = read_embedding(f'{case}_{i}Ats.csv')
column = f'{i}Ats'
df[column] = df[column].replace(to_replace=embedding)
df[column] = pd.to_numeric(df[column])
@@ -99,10 +102,42 @@ def prepare_data(data: dict, case: str) -> pd.DataFrame:
df['Cluster'] = pd.to_numeric(df['Cluster'])
return df
def get_shap_values(model, X_train=None, X_test=None) -> Dict[str, float]:
explainer = shap.TreeExplainer(model, data=X_train)
shap_values = explainer.shap_values(X_test)
shap_values_flat = [round(float(val), 3) for val in shap_values[0]]
shap_values_dict = dict(zip(X_test.columns, shap_values_flat))
return shap_values_dict
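# Illustration only (hypothetical values): for a one-row frame the helper above
# returns one rounded contribution per column, e.g. {'Gender': 0.124, 'Cluster': -0.512}.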
def get_cluster(ats):
model = utility.read_pickle('km.pickle')
model = read_joblib('km.joblib')
prediction = model.predict(ats)
return int(prediction[0])
def split_categorical_columns(df: pd.DataFrame, col: str,
tag: str, resolution: int) -> pd.DataFrame:
split = pd.DataFrame(df[col].str.split(pat=",", expand=True))
split = split.drop(split.iloc[:, resolution:], axis=1)
split = split.fillna(0)
df = pd.concat([df, split], axis=1)
df = df.drop(col, axis=1)
    # df.rename ignores unknown column keys, so the bare try/except is unnecessary.
    for i in range(0, resolution):
        df = df.rename(columns={i: f'{i+1}{tag}'})
return df
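A quick illustrative run of the helper above (device codes invented):

toy = pd.DataFrame({'CitizenId': ['1'], 'Ats': ['222718,093307']})
toy = split_categorical_columns(toy, col='Ats', tag='Ats', resolution=3)
# toy now has columns CitizenId, 1Ats, 2Ats with values '1', '222718', '093307'.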
def read_embedding(filename: str):
dir_path = os.path.dirname(os.path.realpath(__file__))
with open(f'{dir_path}/embeddings/{filename}', 'r') as f:
reader = csv.reader(f)
embedding_dict = {rows[0]:rows[1] for rows in reader}
return embedding_dict
def read_joblib(filename: str) -> Any:
dir_path = os.path.dirname(os.path.realpath(__file__))
return joblib.load(f'{dir_path}/models/{filename}')
if __name__ == "__main__":
uvicorn.run(app, port=8000, host="0.0.0.0")
\ No newline at end of file
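For reference, a hedged example of exercising the running service; the payload field names are hypothetical, since the Features model is truncated above:

import requests

payload = {'Gender': 'FEMALE', 'BirthYear': 46, 'Ats': '222718,093307'}  # invented fields
response = requests.post('http://localhost:8000/predict_complete', json=payload)
# On success the JSON body carries the keys prediction, probability and shap_values.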
pandas==1.2.3
shap==0.39.0
kmodes==0.11.0
fastapi==0.63.0
xgboost==1.3.3
uvicorn==0.13.4
\ No newline at end of file
import shap
from typing import Any, List, Tuple
import numpy as np
from pathlib import Path
import pickle
import joblib
import csv
import pandas as pd
def split_categorical_columns(df: pd.DataFrame, col: str,
tag: str, resolution: int) -> pd.DataFrame:
split = pd.DataFrame(df[col].str.split(pat=",", expand=True))
split = split.drop(split.iloc[:, resolution:], axis=1)
split = split.fillna(0)
df = pd.concat([df, split], axis=1)
df = df.drop(col, axis=1)
    # df.rename ignores unknown column keys, so the bare try/except is unnecessary.
    for i in range(0, resolution):
        df = df.rename(columns={i: f'{i+1}{tag}'})
return df
def read_embedding(filename: str):
    with open(Path('data').joinpath(filename), 'r') as f:
reader = csv.reader(f)
embedding_dict = {rows[0]:rows[1] for rows in reader}
return embedding_dict
def read_pickle(filename: str) -> Any:
    with open(Path('models').joinpath(filename), 'rb') as f:
data = pickle.load(f)
return data
def read_joblib(filename: str) -> Any:
    return joblib.load(Path('models').joinpath(filename))
def get_shap_values(model, X_train=None, X_test=None) -> np.ndarray:
explainer = shap.TreeExplainer(model, data=X_train)
shap_values = explainer.shap_values(X_test)
return shap_values
\ No newline at end of file
#!/usr/bin/env python
import os
import numpy as np
import pandas as pd
import config as cfg
import log as log
from typing import List
+from kmodes import kmodes
from tools import file_reader, file_writer, preprocessor
from utility import cluster
-from kmodes.kmodes import KModes
def main():
-    df = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'screenings.csv',
+    df = file_reader.read_csv(cfg.TESTS_FILES_DIR, 'screenings.csv',
converters={'CitizenId': str})
df = preprocessor.split_categorical_columns(df,
@@ -22,10 +20,8 @@ def main():
header_list = ['CitizenId'] + cols_ats
df = df[header_list]
-    model = KModes(init='Cao', n_clusters=20, n_init=5,
-                   cat_dissim=cluster.calculate_disimmilarity, n_jobs=-1)
+    model = kmodes.KModes(init='Cao', n_clusters=20, n_init=5, n_jobs=-1)
model.fit(df.iloc[:, 1:].astype(str))
-    file_writer.write_pickle(model, cfg.CLUSTERS_DIR, 'km.pickle')
    predictions = model.predict(df.iloc[:, 1:].astype(str))
clusters = pd.Series(predictions, name="Cluster")
@@ -40,8 +36,12 @@ def main():
    # Equivalent one-liner: cluster_centroids_ is already (n_clusters, n_features).
    cluster_centroids = pd.DataFrame(model.cluster_centroids_)
+    file_writer.write_joblib(model, cfg.CLUSTERS_DIR, 'km.joblib')
file_writer.write_csv(cluster_centroids, cfg.INTERIM_DATA_DIR, 'cluster_centroids.csv')
file_writer.write_csv(clusters, cfg.INTERIM_DATA_DIR, 'cl.csv')
def cols_exist(df: pd.DataFrame, cols: List[str]) -> bool:
return set(cols).issubset(df.columns)
if __name__ == '__main__':
main()
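A hedged sketch of reusing the persisted cluster model; the path and device codes are invented, and the input row must be padded to the column width the model was fitted on:

import joblib
import numpy as np

km = joblib.load('models/km.joblib')  # illustrative path
new_ats = np.array([['222718', '093307', '0', '0']])  # invented codes, padded to the fitted width
print(int(km.predict(new_ats)[0]))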
#!/usr/bin/env python
import numpy as np
import config as cfg
import pandas as pd
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
DATASET = 'fall_emb.csv'
MODEL_NAME = 'fall_xgboost.joblib'
MODEL_DIR = cfg.FALL_XGB_DIR
SAVE_MODEL = True
EXPLAIN = True
CSV_FILENAME = f"Fall best features.csv"
def main():
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, DATASET)
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
X = df.drop(['Fall'], axis=1)
y = df['Fall']
test_size = 100
X_test = X[-test_size:]
y_test = y[-test_size:]
X = X[:-test_size]
y = y[:-test_size]
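    # np.bincount on the binary labels yields (negatives, positives); weighting
    # the positive class by neg/pos counteracts class imbalance during training.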
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos
params = {"n_estimators": 400,
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.1,
"eval_metric": "logloss",
"seed": 0
}
model = xgb.XGBClassifier(**params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
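    # Out-of-fold scheme: every row of y_valid_pred is filled in by a model
    # that never saw that row during training.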
y_valid_pred = 0*y
valid_acc, valid_pre, valid_recall = list(), list(), list()
for train_index, valid_index in skf.split(X, y):
X_train, X_valid = X.iloc[train_index,:], X.iloc[valid_index,:]
y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
optimize_rounds = True
early_stopping_rounds = 50
if optimize_rounds:
eval_set=[(X_valid, y_valid)]
fit_model = model.fit(X_train, y_train,
eval_set=eval_set,
eval_metric=metrics.gini_xgb,
early_stopping_rounds=early_stopping_rounds,
verbose=False)
else:
fit_model = model.fit(X_train, y_train)
pred = fit_model.predict_proba(X_valid)[:,1]
y_valid_pred.iloc[valid_index] = pred
y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)
valid_acc.append(accuracy_score(y_valid, y_valid_scores))
valid_pre.append(precision_score(y_valid, y_valid_scores))
valid_recall.append(recall_score(y_valid, y_valid_scores))
print(f"Accuracy: {np.mean(valid_acc)}")
print(f"Precision: {np.mean(valid_pre)}")
print(f"Recall: {np.mean(valid_recall)}")
if SAVE_MODEL:
file_writer.write_joblib(model, MODEL_DIR, MODEL_NAME)
if EXPLAIN:
feature_names = X.columns
shap_explainer, shap_values = explainer.get_shap_tree_explainer(model, X, X_test)
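        # Rank features by mean absolute SHAP value over the held-out rows,
        # keeping XGBoost's built-in importances alongside for comparison.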
importance_df = pd.DataFrame()
importance_df['feature'] = feature_names
importance_df['shap_values'] = np.around(abs(np.array(shap_values)[:,:]).mean(0), decimals=3)
importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=3)
feat_importance_df_shap = importance_df.groupby('feature').mean().sort_values('shap_values',
ascending=False)
feat_importance_df_shap = feat_importance_df_shap.reset_index()
file_writer.write_csv(feat_importance_df_shap, cfg.REPORTS_DIR, CSV_FILENAME)
file_name_sum = 'fall_shap_summary'
file_name_exp = 'fall_shap_row_0'
explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
file_name_sum,
MODEL_DIR,
plot_type=None)
explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
file_name_sum + '_bar',
MODEL_DIR,
plot_type="bar")
explainer.explain_shap_instance(shap_explainer.expected_value,
shap_values[0,:], X_test.iloc[0], feature_names,
file_name_exp, MODEL_DIR)
if __name__ == '__main__':
main()
#!/usr/bin/env python
import numpy as np
import config as cfg
import pandas as pd
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
DATASET = 'success_emb.csv'
MODEL_NAME = 'success_xgboost.joblib'
MODEL_DIR = cfg.SUCCESS_XGB_DIR
SAVE_MODEL = True
EXPLAIN = True
CSV_FILENAME = f"Success best features.csv"
def main():
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, DATASET)
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
X = df.drop(['Success'], axis=1)
y = df['Success']
test_size = 100
X_test = X[-test_size:]
y_test = y[-test_size:]
X = X[:-test_size]
y = y[:-test_size]
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos
params = {"n_estimators": 400,
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.1,
"eval_metric": "logloss",
"seed": 0
}
model = xgb.XGBClassifier(**params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
y_valid_pred = 0*y
valid_acc, valid_pre, valid_recall = list(), list(), list()
for train_index, valid_index in skf.split(X, y):
X_train, X_valid = X.iloc[train_index,:], X.iloc[valid_index,:]
y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
optimize_rounds = True
early_stopping_rounds = 50
if optimize_rounds:
eval_set=[(X_valid, y_valid)]
fit_model = model.fit(X_train, y_train,
eval_set=eval_set,
eval_metric=metrics.gini_xgb,
early_stopping_rounds=early_stopping_rounds,
verbose=False)
else:
fit_model = model.fit(X_train, y_train)
pred = fit_model.predict_proba(X_valid)[:,1]
y_valid_pred.iloc[valid_index] = pred
y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)
valid_acc.append(accuracy_score(y_valid, y_valid_scores))
valid_pre.append(precision_score(y_valid, y_valid_scores))
valid_recall.append(recall_score(y_valid, y_valid_scores))
print(f"Accuracy: {np.around(np.mean(valid_acc), decimals=3)}")
print(f"Precision: {np.around(np.mean(valid_pre), decimals=3)}")
print(f"Recall: {np.around(np.mean(valid_recall), decimals=3)}")
y_pred = model.predict(X_test)
file_writer.write_cm_plot(y_test, y_pred, cfg.REPORTS_PLOTS_DIR,
'success_xgb_cm.pdf', 'Success')
if SAVE_MODEL:
file_writer.write_joblib(model, MODEL_DIR, MODEL_NAME)
if EXPLAIN:
feature_names = X.columns
shap_explainer, shap_values = explainer.get_shap_tree_explainer(model, X, X_test)
importance_df = pd.DataFrame()
importance_df['feature'] = feature_names
importance_df['shap_values'] = np.around(abs(np.array(shap_values)[:,:]).mean(0), decimals=3)
importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=3)
feat_importance_df_shap = importance_df.groupby('feature').mean().sort_values('shap_values',
ascending=False)
feat_importance_df_shap = feat_importance_df_shap.reset_index()
file_writer.write_csv(feat_importance_df_shap, cfg.REPORTS_DIR, CSV_FILENAME)
file_name_sum = 'success_shap_summary'
file_name_exp = 'success_shap_row_0'
explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
file_name_sum,
MODEL_DIR,
plot_type=None)
explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
file_name_sum + '_bar',
MODEL_DIR,
plot_type="bar")
explainer.explain_shap_instance(shap_explainer.expected_value,
shap_values[0,:], X_test.iloc[0], feature_names,
file_name_exp, MODEL_DIR)
if __name__ == '__main__':
main()
@@ -7,80 +7,79 @@ from utility import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import train_test_split
import xgboost as xgb
-DATASET = 'complete_emb.csv'
-MODEL_NAME = 'complete_xgboost.joblib'
-MODEL_DIR = cfg.COMPLETE_XGB_DIR
SAVE_MODEL = True
EXPLAIN = True
-CSV_FILENAME = f"Complete best features.csv"
+DATA_DIR = cfg.TESTS_FILES_DIR
+CASES = ["Complete", "Success", "Fall"]
def main():
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, DATASET)
-    df = df.sample(frac=1, random_state=0).reset_index(drop=True)
+    for case in CASES:
+        if case == "Complete":
+            df = file_reader.read_csv(DATA_DIR, 'complete_emb.csv')
+            model_dir = cfg.COMPLETE_XGB_DIR
+        elif case == "Success":
+            df = file_reader.read_csv(DATA_DIR, 'success_emb.csv')
+            model_dir = cfg.SUCCESS_XGB_DIR
+        else:
+            df = file_reader.read_csv(DATA_DIR, 'fall_emb.csv')
+            model_dir = cfg.FALL_XGB_DIR
-    X = df.drop(['Complete'], axis=1)
-    y = df['Complete']
+        df = df.sample(frac=1, random_state=0).reset_index(drop=True)
-    test_size = 100
-    X_test = X[-test_size:]
-    y_test = y[-test_size:]
-    X = X[:-test_size]
-    y = y[:-test_size]
-    neg, pos = np.bincount(y)
-    scale_pos_weight = neg / pos
+        X = df.drop([case], axis=1)
+        y = df[case]
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
+        neg, pos = np.bincount(y)
+        scale_pos_weight = neg / pos
params = {"n_estimators": 400,
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.1,
"eval_metric": "logloss",
"seed": 0
}
model = xgb.XGBClassifier(**params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
params = {"n_estimators": 400,
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.1,
"eval_metric": "logloss",
"seed": 0
}
model = xgb.XGBClassifier(**params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
-    y_valid_pred = 0*y
-    valid_acc, valid_pre, valid_recall = list(), list(), list()
-    for train_index, valid_index in skf.split(X, y):
-        X_train, X_valid = X.iloc[train_index,:], X.iloc[valid_index,:]
-        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
+        y_valid_pred = 0*y
+        valid_acc, valid_pre, valid_recall = list(), list(), list()
+        for train_index, valid_index in skf.split(X_train, y_train):
+            X_train_split, X_valid_split = X_train.iloc[train_index,:], X_train.iloc[valid_index,:]
+            y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]
optimize_rounds = True
early_stopping_rounds = 50
if optimize_rounds:
eval_set=[(X_valid, y_valid)]