Commit b28156a7 authored by Christian Marius Lillelund

Added a new 'Success' case to measure success

parent fda3bd7d
Pipeline #46998 failed in 3 minutes and 24 seconds
......@@ -9,12 +9,11 @@ import xgboost as xgb
import pandas as pd
from utility.metrics import gini_xgb
import shap
from sklearn.metrics import accuracy_score
from typing import List
CASE = "Complete"
COMPLETE_FILENAME = "complete_emb.csv"
FALL_FILENAME = "fall_emb.csv"
CASE = "Fall"
COMPLETE_FILENAME = "complete_count.csv"
FALL_FILENAME = "fall_count.csv"
CSV_FILENAME = f"{CASE} best features.csv"
PLOT_FILENAME = f"{CASE} SHAP feature values"
NUM_ITERATIONS = 5
......@@ -55,9 +54,11 @@ def get_best_shap_features(X: np.ndarray, y: np.ndarray,
scale_pos_weight = neg / pos
model = xgb.XGBClassifier(n_estimators=400,
max_depth=10,
learning_rate=0.1,
objective='binary:logistic',
scale_pos_weight=scale_pos_weight,
eval_metric='logloss',
use_label_encoder=False,
n_jobs=-1,
random_state=0,
seed=seed)
......@@ -74,17 +75,16 @@ def get_best_shap_features(X: np.ndarray, y: np.ndarray,
verbose=0)
y_val_pred = model.predict_proba(X_val)[:,1]
y_scores_new = (y_val_pred > 0.5)
-acc_score = np.around(accuracy_score(y_val, y_scores_new), decimals=3)
-acc_score_list.append(acc_score)
+acc_score_list.append(np.around(accuracy_score(y_val, y_scores_new), decimals=3))
shap_values = shap.TreeExplainer(model).shap_values(X_train)
fold_importance_df = pd.DataFrame()
fold_importance_df['feature'] = cols
-fold_importance_df['shap_values'] = np.around(abs(np.array(shap_values)[:,:]).mean(0), decimals = 2)
-fold_importance_df['feat_imp'] = np.around(model.feature_importances_, decimals = 2)
+fold_importance_df['shap_values'] = np.around(abs(np.array(shap_values)[:,:]).mean(0), decimals=2)
+fold_importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=2)
feat_importance_df = pd.concat([feat_importance_df, fold_importance_df])
-mean_accuracy = np.mean(acc_score_list)
-print(f"Mean accuracy: {mean_accuracy}")
+mean_acc_score = np.mean(acc_score_list)
+print(f"Mean accuracy: {mean_acc_score}")
feat_importance_df_shap = feat_importance_df.groupby('feature').mean().sort_values('shap_values',
ascending=False)
feat_importance_df_shap = feat_importance_df_shap.reset_index()
......
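For reference, the hunk above averages absolute SHAP values per feature within each CV fold. A minimal, self-contained sketch of that aggregation (synthetic matrix and made-up feature names, not the project's data):

import numpy as np
import pandas as pd

shap_values = np.array([[0.20, -0.10, 0.05],
                        [-0.30, 0.00, 0.10]])  # (n_samples, n_features)
cols = ['Age', 'Gender', 'LoanPeriod']

fold_importance_df = pd.DataFrame()
fold_importance_df['feature'] = cols
# mean absolute SHAP value per feature, rounded as in get_best_shap_features
fold_importance_df['shap_values'] = np.around(np.abs(shap_values).mean(0), decimals=2)
print(fold_importance_df.sort_values('shap_values', ascending=False))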
......@@ -54,18 +54,18 @@ def main():
test_size=0.3, random_state=0)
model = make_model(input_dim=X.shape[1])
-no_class_weight_history = model.fit(np.array(X_train), np.array(y_train), epochs=50,
+no_class_weight_history = model.fit(np.array(X_train), np.array(y_train), epochs=25,
validation_data=(np.array(X_valid), np.array(y_valid)), batch_size=16)
neg, pos = np.bincount(y_train)
initial_bias = np.log([pos/neg])
model = make_model(input_dim=X.shape[1], output_bias=initial_bias)
-no_class_weight_bias_history = model.fit(np.array(X_train), np.array(y_train), epochs=50,
+no_class_weight_bias_history = model.fit(np.array(X_train), np.array(y_train), epochs=25,
validation_data=(np.array(X_valid), np.array(y_valid)), batch_size=16)
class_weight = preprocessor.get_class_weight(neg, pos)
model = make_model(input_dim=X.shape[1])
-class_weight_history = model.fit(np.array(X_train), np.array(y_train), epochs=50,
+class_weight_history = model.fit(np.array(X_train), np.array(y_train), epochs=25,
validation_data=(np.array(X_valid), np.array(y_valid)), batch_size=16,
class_weight=class_weight)
......
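The three fits above compare plain training, output-bias initialisation, and class weighting on the same model. A toy sketch of the two imbalance corrections (the weighting formula is an assumption; preprocessor.get_class_weight may differ):

import numpy as np

y_train = np.array([0, 0, 0, 0, 1])   # toy imbalanced labels
neg, pos = np.bincount(y_train)       # class counts: 4 negative, 1 positive
initial_bias = np.log([pos / neg])    # sigmoid(bias) = pos / (neg + pos)

# Keras-style class weights that rebalance the loss (one common scheme):
total = neg + pos
class_weight = {0: total / (2.0 * neg), 1: total / (2.0 * pos)}
print(initial_bias, class_weight)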
#!/usr/bin/env python
import numpy as np
import pandas as pd
import config as cfg
from typing import List
from tools import file_reader, file_writer, preprocessor, classifiers
......@@ -9,7 +8,7 @@ tf.get_logger().setLevel('ERROR')
from pathlib import Path
NUM_ITER = 10
CASES = ["Complete", "Fall"]
CASES = ["Complete", "Success", "Fall"]
class Result:
def __init__(self, name, result):
......@@ -23,16 +22,17 @@ class CVResult:
self.rec = rec
self.rocauc = rocauc
-ATS_COLS = [str(i)+'Ats' for i in range(1,11)]
+ATS_COLS = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)] \
+  + ['Cluster', 'LoanPeriod', 'NumberAts']
CLF_NAMES = ["MLP", "LR", "XGB", "RF", "SVM", "KNN"]
CLASSIFIERS = {
"MLP": classifiers.train_mlp_cv,
"LR": classifiers.train_lr_cv,
"XGB": classifiers.train_xgb_cv,
"RF": classifiers.train_rf_cv,
"SVM": classifiers.train_svm_cv,
"KNN": classifiers.train_knn_cv
}
"MLP": classifiers.train_mlp_cv,
"LR": classifiers.train_lr_cv,
"XGB": classifiers.train_xgb_cv,
"RF": classifiers.train_rf_cv,
"SVM": classifiers.train_svm_cv,
"KNN": classifiers.train_knn_cv
}
def load_complete():
ats = {str(i)+'Ats':str for i in range(1,11)}
......@@ -47,7 +47,14 @@ def load_fall():
'fall.csv',
converters=converters)
return df
def load_success():
converters = {str(i)+'Ats':str for i in range(1,11)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
'success.csv',
converters=converters)
return df
def main():
for case in CASES:
results_filename = f"{case} baseline results.txt"
......@@ -60,6 +67,11 @@ def main():
X = df.drop(['Complete'], axis=1)
y = df['Complete']
X = X.drop(ATS_COLS, axis=1)
elif case == "Success":
df = load_success()
X = df.drop(['Success'], axis=1)
y = df['Success']
X = X.drop(ATS_COLS, axis=1)
else:
df = load_fall()
X = df.drop(['Fall'], axis=1)
......@@ -87,6 +99,10 @@ def main():
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
X = df.drop(['Complete'], axis=1)
y = df['Complete']
elif case == "Success":
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'success_emb.csv')
X = df.drop(['Success'], axis=1)
y = df['Success']
else:
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
X = df.drop(['Fall'], axis=1)
......@@ -116,6 +132,10 @@ def main():
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete_count.csv')
X = df.drop(['Complete'], axis=1)
y = df['Complete']
elif case == "Success":
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'success_count.csv')
X = df.drop(['Success'], axis=1)
y = df['Success']
else:
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_count.csv')
X = df.drop(['Fall'], axis=1)
......@@ -140,7 +160,7 @@ def train_clf(X: np.ndarray, y: np.ndarray, clf_name: str,
for k in range(NUM_ITER):
X = preprocessor.scale_data_minmax(X, n_scale_cols)
clf = CLASSIFIERS[clf_name]
-_, res_acc, res_pre, res_rec, res_rocauc, res_probas = clf(X, y, case, k)
+_, res_acc, res_pre, res_rec, res_rocauc, res_probas = clf(X, y, k)
make_and_print_scores(clf_name, k, res_acc, res_pre, res_rec, res_rocauc, results_filename)
y_pred_probas += res_probas[:,1]
y_pred_acc_mean.append(res_acc.mean())
......@@ -180,10 +200,9 @@ def make_plots(y_test: np.ndarray, results: np.ndarray,
roc_file_name = f"{case_name} version {case_number} - ROC curves.pdf"
results_list = list(results)
file_writer.write_roc_curve(y_test, results_list,
cfg.REPORTS_PLOTS_DIR, roc_file_name, case_subtitle)
file_writer.write_accuracy_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
f"{case_name} version {case_number} - Accuracy.pdf",
case_subtitle)
f"{case_name} version {case_number} - Accuracy.pdf", case_subtitle)
file_writer.write_precision_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
f"{case_name} version {case_number} - Precision.pdf", case_subtitle)
file_writer.write_recall_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
......
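In train_clf above, each iteration's positive-class probabilities are accumulated into y_pred_probas; dividing by NUM_ITER afterwards gives a soft-voting average. A toy sketch (shapes and values are illustrative):

import numpy as np

NUM_ITER = 3
runs = [np.array([[0.4, 0.6], [0.9, 0.1]]) for _ in range(NUM_ITER)]  # predict_proba outputs
y_pred_probas = np.zeros(2)
for res_probas in runs:
    y_pred_probas += res_probas[:, 1]  # accumulate P(class=1) per sample
y_pred_probas /= NUM_ITER              # mean probability across iterations
print(y_pred_probas)                   # [0.6 0.1]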
......@@ -23,9 +23,7 @@ def create_model(hp, input_dim=14):
hp_dropout = hp.Choice('dropout', values=[0.1, 0.2, 0.5])
model.add(tf.keras.layers.Dropout(hp_dropout))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
-hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3, 1e-4, 1e-5])
-hp_momentum = hp.Choice('momentum', [0.0, 0.2, 0.4, 0.6, 0.8, 0.9])
-model.compile(optimizer=tf.keras.optimizers.SGD(hp_learning_rate, momentum=hp_momentum),
+model.compile(optimizer=tf.keras.optimizers.Adam(),
loss='binary_crossentropy',
metrics=['accuracy'])
return model
......
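create_model above is a Keras Tuner hypermodel: hp.Choice draws each setting from a fixed list per trial. A hypothetical wiring (assumes the keras-tuner package; the trial count and objective here are illustrative, not the project's):

import keras_tuner as kt

tuner = kt.RandomSearch(lambda hp: create_model(hp, input_dim=14),
                        objective='val_accuracy',
                        max_trials=10,
                        overwrite=True)
# tuner.search(X_train, y_train, validation_data=(X_valid, y_valid), epochs=25)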
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from tools import data_loader
CASE = "Complete"
COMPLETE_FILENAME = "complete_with_embeddings.csv"
FALL_FILENAME = "fall_with_embeddings.csv"
SCALING_STRATEGY = "Standard"
def main():
pipeline = Pipeline([
('scaler', StandardScaler()),
('clf', 'passthrough')])
n_neighbors = [2, 4, 6, 8, 10, 15, 20, 50]
max_depth = [2, 6, 10, 50]
n_estimators = [100, 200, 400, 800]
max_features = [4, 8, 20, 36]
min_samples_leaf = [5, 10, 20, 40]
kernel = ['linear', 'poly', 'rbf']
param_grid = [
{
'clf': [KNeighborsClassifier()],
'clf__n_neighbors': n_neighbors
},
{
'clf': [RandomForestClassifier(random_state=0)],
'clf__max_depth': max_depth,
'clf__n_estimators': n_estimators,
'clf__max_features': max_features,
'clf__min_samples_leaf': min_samples_leaf
},
{
'clf': [LogisticRegression(random_state=0)]
},
{
'clf': [SVC(random_state=0)],
'clf__kernel': kernel,
},
{
'clf': [GaussianNB()]
}
]
if CASE == "Complete":
X, y = data_loader.CompleteDataLoader(COMPLETE_FILENAME) \
.load_data().prepare_data(SCALING_STRATEGY)
else:
X, y = data_loader.FallDataLoader(FALL_FILENAME) \
.load_data().prepare_data(SCALING_STRATEGY)
grid = GridSearchCV(pipeline, cv=5, n_jobs=-1, param_grid=param_grid, scoring="accuracy")
res = grid.fit(X, y)
def report(results, n_top=10):
for i in range(1, n_top + 1):
candidates = np.flatnonzero(results['rank_test_score'] == i)
for candidate in candidates:
print("Model with rank: {0}".format(i))
print("Mean validation score: {0:.3f} (std: {1:.3f})"
.format(results['mean_test_score'][candidate],
results['std_test_score'][candidate]))
print("Parameters: {0}".format(results['params'][candidate]))
print("")
report(res.cv_results_)
if __name__ == "__main__":
main()
\ No newline at end of file
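The 'passthrough' placeholder above lets a single GridSearchCV sweep several estimator families: each param_grid entry substitutes a different classifier into the 'clf' step. A condensed, runnable version of the pattern (synthetic data and smaller grids):

from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=200, random_state=0)
pipe = Pipeline([('scaler', StandardScaler()), ('clf', 'passthrough')])
param_grid = [
    {'clf': [LogisticRegression(random_state=0)]},
    {'clf': [KNeighborsClassifier()], 'clf__n_neighbors': [2, 4]},
]
grid = GridSearchCV(pipe, cv=3, param_grid=param_grid, scoring='accuracy')
print(grid.fit(X, y).best_params_)  # winning step and its settings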
import numpy as np
import pandas as pd
import config as cfg
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from tools import data_loader, file_writer
from sklearn.model_selection import StratifiedKFold
CASE = "Complete"
COMPLETE_FILENAME = "complete_count.csv"
FALL_FILENAME = "fall_count.csv"
SCALING_STRATEGY = "Standard"
def main():
if CASE == "Complete":
X, y = data_loader.CompleteDataLoader(COMPLETE_FILENAME) \
.load_data().prepare_data(SCALING_STRATEGY)
else:
X, y = data_loader.FallDataLoader(FALL_FILENAME) \
.load_data().prepare_data(SCALING_STRATEGY)
params = {
'min_child_weight': [1, 5, 10, 20],
'gamma': [0.1, 0.2, 0.5, 1, 1.5, 2, 5, 10],
'subsample': [0.2, 0.4, 0.6, 0.8, 1.0],
'colsample_bytree': [0.2, 0.4, 0.6, 0.8, 1.0],
'max_depth': [1, 2, 3, 4, 5, 10, 20, 50]
}
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos
xgb = XGBClassifier(learning_rate=0.1,
n_estimators=400,
scale_pos_weight=scale_pos_weight,
objective='binary:logistic',
eval_metric='logloss',
use_label_encoder=False,
seed=0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
random_search = RandomizedSearchCV(xgb,
param_distributions=params,
n_iter=10,
scoring='neg_log_loss',
n_jobs=-1,
cv=skf,
verbose=3,
random_state=0)
random_search.fit(X, y)
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best neg-log-loss score for %d-fold search with %d sampled parameter settings:' % (5, 10))
print(random_search.best_score_)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
file_writer.write_csv(results, cfg.REPORTS_DIR, 'xgb-random-grid-search-results.csv')
if __name__ == "__main__":
main()
\ No newline at end of file
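RandomizedSearchCV samples n_iter=10 parameter settings from the distributions above and cross-validates each. A sketch of ranking them after the search (assumes the fitted random_search from main()):

import pandas as pd

results = pd.DataFrame(random_search.cv_results_)
cols = ['rank_test_score', 'mean_test_score', 'std_test_score', 'params']
print(results.sort_values('rank_test_score')[cols].head(5))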
......@@ -39,8 +39,15 @@ FALL_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/fall/xgboost')
FALL_CAT_DIR = Path.joinpath(ROOT_DIR, 'models/fall/catboost')
FALL_RF_DIR = Path.joinpath(ROOT_DIR, 'models/fall/random_forest')
FALL_EMB_DIR = Path.joinpath(ROOT_DIR, 'models/fall/embeddings')
SUCCESS_DIR = Path.joinpath(ROOT_DIR, 'models/success')
SUCCESS_TF_DIR = Path.joinpath(ROOT_DIR, 'models/success/tensorflow')
SUCCESS_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/success/xgboost')
SUCCESS_CAT_DIR = Path.joinpath(ROOT_DIR, 'models/success/catboost')
SUCCESS_RF_DIR = Path.joinpath(ROOT_DIR, 'models/success/random_forest')
SUCCESS_EMB_DIR = Path.joinpath(ROOT_DIR, 'models/success/embeddings')
GENERAL_FEATURES = ['Gender', 'Age', 'Cluster']
ATS_RESOLUTION = 50
THRESHOLD_WEEKS = 8
THRESHOLD_TRAINING = 10
FALL_EXERCISE_THRESHOLD = 3
......
......@@ -8,8 +8,7 @@ import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA
CASES = ["Complete", "Fall"]
ATS_RESOLUTION = 50
CASES = ["Complete", "Success", "Fall"]
def main():
cl = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'cl.csv',
......@@ -31,10 +30,12 @@ def make_dataset_full(cl: pd.DataFrame, df: pd.DataFrame, case: str):
df = pd.merge(df, cl[['CitizenId', 'Cluster']], how='inner', on=['CitizenId'])
df['Cluster'] = df['Cluster'].fillna(0)
-df = preprocessor.split_categorical_columns(df, col='Ats', tag='Ats', resolution=ATS_RESOLUTION)
+df = preprocessor.split_categorical_columns(df, col='Ats', tag='Ats', resolution=cfg.ATS_RESOLUTION)
if case == "Complete":
df = feature_maker.make_complete_feature(df)
elif case == "Success":
df = feature_maker.make_success_feature(df)
else:
df = feature_maker.make_fall_feature(df)
......@@ -45,14 +46,14 @@ def make_dataset_full(cl: pd.DataFrame, df: pd.DataFrame, case: str):
return df
def make_dataset_count(case: str):
-ats = {str(i)+'Ats':str for i in range(1,ATS_RESOLUTION+1)}
+ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
f'{case.lower()}.csv',
converters=ats)
num_cols = embedder.get_numerical_cols(df, case)
-cols_ats = [str(i)+'Ats' for i in range(1,ATS_RESOLUTION+1)]
-unique_ats = [df[f'{i}Ats'].unique() for i in range(1,ATS_RESOLUTION+1)]
+cols_ats = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
+unique_ats = [df[f'{i}Ats'].unique() for i in range(1, cfg.ATS_RESOLUTION+1)]
unique_ats = list(set(np.concatenate(unique_ats)))
df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
......@@ -64,7 +65,7 @@ def make_dataset_count(case: str):
return df
def make_dataset_emb(case: str):
-ats = {str(i)+'Ats':str for i in range(1,ATS_RESOLUTION+1)}
+ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
f'{case.lower()}.csv',
converters=ats)
......@@ -83,6 +84,9 @@ def make_dataset_emb(case: str):
if case == "Complete":
artifacts_path = cfg.COMPLETE_EMB_DIR
epochs = 5
elif case == "Success":
artifacts_path = cfg.SUCCESS_EMB_DIR
epochs = 5
else:
artifacts_path = cfg.FALL_EMB_DIR
epochs = 20
......
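The case branches at the end of make_dataset_emb select an artifacts directory and an epoch count. An equivalent table-driven sketch (values mirror the branches above; assumes config imported as cfg and a case string as in main()):

EMB_SETTINGS = {
    "Complete": (cfg.COMPLETE_EMB_DIR, 5),
    "Success": (cfg.SUCCESS_EMB_DIR, 5),
    "Fall": (cfg.FALL_EMB_DIR, 20),
}
artifacts_path, epochs = EMB_SETTINGS[case]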
......@@ -7,7 +7,6 @@ from tools import file_reader, file_writer, preprocessor
import tensorflow as tf
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tools import explainer
dataset = 'complete_with_embeddings.csv'
......
#!/usr/bin/env python
import numpy as np
import config as cfg
-from tools import file_reader, file_writer, explainer, tree_classifier
+from tools import file_reader, file_writer, explainer, classifiers
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
-DATASET = 'complete_with_count.csv'
+DATASET = 'complete_count.csv'
MODEL_NAME = 'random_forest.joblib'
MODEL_DIR = cfg.COMPLETE_RF_DIR
SAVE_MODEL = True
......@@ -23,15 +23,13 @@ def main():
X = X[:-test_size]
y = y[:-test_size]
-model, result_acc, result_pre, result_recall, result_rocauc, _ = tree_classifier.train_rf_cv(X, y)
+model, result_acc, result_pre, result_recall, result_rocauc, _ = classifiers.train_rf_cv(X, y)
print(f"Mean valid accuracy: {round(np.mean(result_acc), 3)}")
print(f"Mean valid precision: {round(np.mean(result_pre), 3)}")
print(f"Mean valid recall: {round(np.mean(result_recall), 3)}")
print(f"Mean valid AUC: {round(np.mean(result_rocauc), 3)}")
y_test_pred = model.predict(X_test)
print(f"Confusion matrix:\n {confusion_matrix(y_test, y_test_pred)}\n")
print(f"Classification report:\n {classification_report(y_test, y_test_pred)}\n")
......
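For reference, sklearn's confusion_matrix and classification_report used above take the true and predicted labels directly. A toy illustration of the two calls:

from sklearn.metrics import confusion_matrix, classification_report

y_test = [0, 1, 1, 0]
y_test_pred = [0, 1, 0, 0]
print(confusion_matrix(y_test, y_test_pred))        # 2x2 counts for a binary task
print(classification_report(y_test, y_test_pred))   # per-class precision/recall/F1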
#!/usr/bin/env python
import numpy as np
import config as cfg
import pandas as pd
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import accuracy_score
......@@ -8,11 +9,12 @@ from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
-DATASET = 'complete_emb.csv'
+DATASET = 'complete_count.csv'
MODEL_NAME = 'xgboost.joblib'
MODEL_DIR = cfg.COMPLETE_XGB_DIR
SAVE_MODEL = True
EXPLAIN = True
CSV_FILENAME = f"Complete best features.csv"
def main():
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, DATASET)
......@@ -31,9 +33,11 @@ def main():
scale_pos_weight = neg / pos
params = {"n_estimators": 400,
"max_depth": 6,
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.1,
"eval_metric": "logloss",
"seed": 0
}
model = xgb.XGBClassifier(**params)
......@@ -68,6 +72,10 @@ def main():
print(f"Accuracy: {np.around(np.mean(valid_acc), decimals=3)}")
print(f"Precision: {np.around(np.mean(valid_pre), decimals=3)}")
print(f"Recall: {np.around(np.mean(valid_recall), decimals=3)}")
y_pred = model.predict(X_test)
file_writer.write_cm_plot(y_test, y_pred, cfg.REPORTS_PLOTS_DIR,
'complete_xgb_cm.pdf', 'Complete')
if SAVE_MODEL:
file_writer.write_model(model, MODEL_DIR, MODEL_NAME)
......@@ -75,6 +83,14 @@ def main():
if EXPLAIN:
feature_names = X.columns
shap_explainer, shap_values = explainer.get_shap_tree_explainer(model, X, X_test)
importance_df = pd.DataFrame()
importance_df['feature'] = feature_names
importance_df['shap_values'] = np.around(abs(np.array(shap_values)[:,:]).mean(0), decimals=3)
importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=3)
feat_importance_df_shap = importance_df.groupby('feature').mean().sort_values('shap_values',
ascending=False)
feat_importance_df_shap = feat_importance_df_shap.reset_index()
file_writer.write_csv(feat_importance_df_shap, cfg.REPORTS_DIR, CSV_FILENAME)
file_name_sum = 'complete_shap_summary'
file_name_exp = 'complete_shap_row_0'
explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
......
#!/usr/bin/env python
import numpy as np
import config as cfg
import pandas as pd
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
......@@ -8,11 +9,12 @@ from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
-DATASET = 'fall_with_count.csv'
+DATASET = 'fall_count.csv'
MODEL_NAME = 'xgboost.joblib'
MODEL_DIR = cfg.FALL_XGB_DIR
SAVE_MODEL = True
EXPLAIN = True
CSV_FILENAME = f"Fall best features.csv"
def main():
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, DATASET)
......@@ -31,10 +33,11 @@ def main():
scale_pos_weight = neg / pos
params = {"n_estimators": 400,
"max_depth": 6,
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.1,
"eval_metric": "logloss",
"seed": 0
}
model = xgb.XGBClassifier(**params)
......@@ -76,6 +79,14 @@ def main():
if EXPLAIN:
feature_names = X.columns
shap_explainer, shap_values = explainer.get_shap_tree_explainer(model, X, X_test)
importance_df = pd.DataFrame()
importance_df['feature'] = feature_names
importance_df['shap_values'] = np.around(abs(np.array(shap_values)[:,:]).mean(0), decimals=3)
importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=3)
feat_importance_df_shap = importance_df.groupby('feature').mean().sort_values('shap_values',
ascending=False)
feat_importance_df_shap = feat_importance_df_shap.reset_index()
file_writer.write_csv(feat_importance_df_shap, cfg.REPORTS_DIR, CSV_FILENAME)
file_name_sum = 'fall_shap_summary'
file_name_exp = 'fall_shap_row_0'
explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
......
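The eight-line SHAP importance block appears verbatim in both the Complete and Fall scripts above; a shared helper would remove the duplication. A sketch (hypothetical function, not in the repo):

import numpy as np
import pandas as pd

def shap_importance_table(model, shap_values, feature_names):
    # mean |SHAP| and XGBoost feature importance per feature, sorted descending
    df = pd.DataFrame({
        'feature': feature_names,
        'shap_values': np.around(np.abs(np.array(shap_values)).mean(0), decimals=3),
        'feat_imp': np.around(model.feature_importances_, decimals=3),
    })
    return (df.groupby('feature').mean()
              .sort_values('shap_values', ascending=False)
              .reset_index())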
#!/usr/bin/env python
import numpy as np
import config as cfg
import pandas as pd
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import accuracy_score