Commit bce4cd47 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

improved scripts for analysis

parent e6451caf
Pipeline #67888 passed with stage
in 3 minutes and 27 seconds
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -17,4 +17,5 @@ fastapi==0.63.0
uvicorn==0.13.4
fastapi-jwt-auth==0.5.0
PyYAML==5.4.1
tensorflow-privacy==0.6.2
\ No newline at end of file
tensorflow-privacy==0.6.2
imbalanced-learn==0.8.0
\ No newline at end of file
......@@ -10,40 +10,51 @@ import pandas as pd
from utility.metrics import gini_xgb
import shap
from typing import List
from utility.settings import load_settings
# Dataset case selector; NOTE(review): the code below iterates over all
# cases itself, so these single-case constants look like stale leftovers
# from an earlier version -- confirm before relying on them.
CASE = "Complete"
# Input CSV file names (embedded-feature versions of each dataset).
COMPLETE_FILENAME = "complete_emb.csv"
COMPLIANCE_FILENAME = "compliance_emb.csv"
FALL_FILENAME = "fall_emb.csv"
# Output report/plot file names derived from the selected case.
CSV_FILENAME = f"{CASE} best features.csv"
PLOT_FILENAME = f"{CASE} SHAP feature values"
# Number of seeded SHAP runs to average over.
NUM_ITERATIONS = 5
def main():
    """Compute averaged SHAP feature importances for every dataset case.

    For each case the embedded dataset is loaded, SHAP values are computed
    over NUM_ITERATIONS seeded runs, averaged per feature, and the ranked
    result is written out as a plot and a CSV report.

    This rewrite removes the stale single-case code path and the duplicated
    loading/SHAP loops left over from an earlier version of the script, and
    fixes the non-PEP8 `plot_file_Name` local.
    """
    # Map each case to its (settings file, data file, loader class).
    loaders = {
        "Complete": ("complete_emb.yaml", "complete_emb.csv",
                     data_loader.CompleteDataLoader),
        "Compliance": ("compliance_emb.yaml", "compliance_emb.csv",
                       data_loader.ComplianceDataLoader),
        "Fall": ("fall_emb.yaml", "fall_emb.csv",
                 data_loader.FallDataLoader),
        "Risk": ("risk_emb.yaml", "risk_emb.csv",
                 data_loader.RiskDataLoader),
    }
    for case, (settings_file, data_file, loader_cls) in loaders.items():
        settings = load_settings(settings_file)
        dl = loader_cls(data_file, settings).load_data()
        features = dl.get_features()
        X, y = dl.prepare_data()
        X = np.array(X)
        y = np.array(y)
        # Accumulate per-feature SHAP values across seeded iterations.
        total_shap_df = pd.DataFrame()
        for seed in range(NUM_ITERATIONS):
            print('#' * 40,
                  '{} of {} iterations'.format(seed + 1, NUM_ITERATIONS),
                  '#' * 40)
            temp_shap_df = get_best_shap_features(X, y, features, seed)
            total_shap_df = pd.concat([total_shap_df, temp_shap_df])
        # Average over iterations and rank features by mean SHAP value.
        shap_sorted_df = total_shap_df.groupby('feature').mean() \
            .sort_values('shap_values', ascending=False)
        shap_sorted_df = shap_sorted_df.reset_index()
        importances = shap_sorted_df['shap_values']
        features = shap_sorted_df['feature']
        plot_file_name = f"{case} SHAP feature values"
        csv_file_name = f"{case} best features.csv"
        file_writer.write_shap_importance_plot(features, importances,
                                               pt.REPORTS_PLOTS_DIR,
                                               plot_file_name)
        file_writer.write_csv(shap_sorted_df, pt.REPORTS_DIR, csv_file_name)
def get_best_shap_features(X: np.ndarray, y: np.ndarray,
cols: List[str], seed: int):
......
......@@ -4,96 +4,95 @@ import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from utility.settings import load_settings
import csv
from typing import List
import paths as pt
from pathlib import Path
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import make_pipeline, Pipeline
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
# Dataset case used by the legacy single-case code path; NOTE(review):
# the new main() iterates over all cases, so this may be a stale leftover.
CASE = "Complete"
# Input CSV file names (embedded-feature versions of each dataset).
COMPLETE_FILENAME = "complete_emb.csv"
COMPLIANCE_FILENAME = "compliance_emb.csv"
FALL_FILENAME = "fall_emb.csv"
# Matplotlib's default colour cycle, so related curves share colours.
COLORS = plt.rcParams['axes.prop_cycle'].by_key()['color']
# Keras metrics tracked while training the binary classifier.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
]
def make_model(input_dim, output_bias=None):
    """Build and compile a one-hidden-layer Keras binary classifier.

    When `output_bias` is given it initialises the output neuron's bias
    (a common trick for imbalanced data). Compiled with binary
    cross-entropy, Adam, and the module-level METRICS.
    """
    bias_init = None
    if output_bias is not None:
        bias_init = tf.keras.initializers.Constant(output_bias)
    layers = [
        tf.keras.layers.Dense(128, input_dim=input_dim, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid',
                              bias_initializer=bias_init),
    ]
    model = tf.keras.models.Sequential(layers)
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=METRICS)
    return model
def make_model(input_dim, class_weight=None):
    """Return a scikit-learn-compatible wrapper around a small Keras MLP.

    The wrapped network has three relu hidden layers with dropout and a
    sigmoid output; the optional `class_weight` dict is forwarded by
    KerasClassifier to fit().
    """
    def make_keras_model():
        # (units, dropout rate) for each hidden layer, in order.
        hidden = ((80, 0.35), (20, 0.15), (10, 0.15))
        model = tf.keras.models.Sequential()
        for idx, (units, rate) in enumerate(hidden):
            if idx == 0:
                # Only the first layer needs the input dimension.
                model.add(tf.keras.layers.Dense(units, input_dim=input_dim,
                                                activation='relu'))
            else:
                model.add(tf.keras.layers.Dense(units, activation='relu'))
            model.add(tf.keras.layers.Dropout(rate))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer="Adam")
        return model
    return KerasClassifier(make_keras_model, epochs=20, batch_size=32,
                           class_weight=class_weight, verbose=False)
def make_plots(history, label, n, metric):
    """Plot train and validation curves of one metric from a Keras history.

    `n` indexes the colour cycle so a train/val pair shares one colour;
    the validation curve is drawn dashed.
    """
    colour = COLORS[n]
    epochs = history.epoch
    plt.plot(epochs, history.history[metric],
             color=colour, label='Train ' + label)
    plt.plot(epochs, history.history[f'val_{metric}'],
             color=colour, label='Val ' + label,
             linestyle="--")
    plt.xlabel('Epoch')
    plt.ylabel(metric)
    plt.legend()
def main():
    """Compare class-imbalance strategies for an MLP on every dataset case.

    For each case the embedded dataset is cross-validated three ways --
    no class weighting ('NoCW'), class weighting ('CW'), and ADASYN
    oversampling ('Oversampling') -- and the mean test scores are appended
    to a per-case CSV report.

    This rewrite removes the stale single-case loading path and the
    history-plotting code interleaved from an earlier version of the script.
    """
    cases = ["Complete", "Compliance", "Fall", "Risk"]
    for case in cases:
        output_filename = f"{case} model balance results.csv"
        header = ['clf', 'version', 'accuracy', 'precision',
                  'recall', 'roc_auc', 'pr_auc']
        # Start each report with a fresh header row.
        with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'w',
                  encoding='UTF8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
        # Load the embedded dataset for this case.
        if case == "Complete":
            settings = load_settings("complete_emb.yaml")
            dl = data_loader.CompleteDataLoader("complete_emb.csv",
                                                settings).load_data()
        elif case == "Compliance":
            settings = load_settings("compliance_emb.yaml")
            dl = data_loader.ComplianceDataLoader("compliance_emb.csv",
                                                  settings).load_data()
        elif case == "Fall":
            settings = load_settings("fall_emb.yaml")
            dl = data_loader.FallDataLoader("fall_emb.csv",
                                            settings).load_data()
        else:
            settings = load_settings("risk_emb.yaml")
            dl = data_loader.RiskDataLoader("risk_emb.csv",
                                            settings).load_data()
        X, y = dl.prepare_data()
        versions = ['NoCW', 'CW', 'Oversampling']
        metrics = ['accuracy', 'precision', 'recall', 'roc_auc',
                   'average_precision']
        for version in versions:
            kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
            if version == "NoCW":
                model = make_model(input_dim=X.shape[1])
                results = cross_validate(model, X, y, cv=kfold,
                                         scoring=metrics)
            elif version == "CW":
                # Weight classes inversely to their frequency.
                neg, pos = np.bincount(y)
                class_weight = preprocessor.get_class_weight(neg, pos)
                model = make_model(input_dim=X.shape[1],
                                   class_weight=class_weight)
                results = cross_validate(model, X, y, cv=kfold,
                                         scoring=metrics)
            else:
                # Oversample the minority class inside each CV fold.
                adasyn = ADASYN(random_state=0)
                model = make_model(input_dim=X.shape[1])
                pipeline = Pipeline([('sampling', adasyn), ('model', model)])
                results = cross_validate(pipeline, X, y, cv=kfold,
                                         scoring=metrics)
            with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'a',
                      encoding='UTF8', newline='') as f:
                writer = csv.writer(f)
                data = ["MLP", version,
                        round(np.mean(results['test_accuracy'] * 100), 3),
                        round(np.mean(results['test_precision'] * 100), 3),
                        round(np.mean(results['test_recall'] * 100), 3),
                        round(np.mean(results['test_roc_auc'] * 100), 3),
                        round(np.mean(results['test_average_precision'] * 100), 3)]
                writer.writerow(data)
if __name__ == '__main__':
main()
\ No newline at end of file
#!/usr/bin/env python
import numpy as np
import paths as pt
from typing import List
from tools import file_reader, file_writer, preprocessor, data_loader
from tools.classifiers import KnnClassifier, SvmClassifier, LrClassifier
from tools.classifiers import XgbClassifier, RfClassifier, MlpClassifier
import tensorflow as tf
from pathlib import Path
import csv
from utility.settings import load_settings
from utility.metrics import compute_mean, compute_std
tf.get_logger().setLevel('ERROR')
class Result:
    """Container pairing a classifier name with its evaluation result."""

    def __init__(self, name, result):
        self.name, self.result = name, result
class CVResult:
    """Cross-validation scores: accuracy, precision, recall, ROC AUC."""

    def __init__(self, acc, pre, rec, rocauc):
        self.acc, self.pre, self.rec, self.rocauc = acc, pre, rec, rocauc
def get_version_subtitle(version):
    """Return the human-readable plot subtitle for a dataset version.

    Any version other than 'NoAts' or 'Embedded' falls back to the
    counts subtitle, mirroring the original if/else chain.
    """
    subtitles = {
        "NoAts": "without Ats and/or Ex columns",
        "Embedded": "with embeddings",
    }
    return subtitles.get(version, "with counts")
def prepare_data(X, y, settings):
    """Normalize and scale the configured feature columns of X.

    Reads the column lists from `settings` and returns X and y as
    NumPy arrays.
    """
    X = preprocessor.normalize_data(X, settings['features_to_normalize'])
    X = preprocessor.scale_data(X, settings['features_to_scale'])
    return np.array(X), np.array(y)
def load_data_embedded(case, settings):
    """Load the embedded-feature dataset for `case` and return (X, y).

    Any unrecognised case falls back to the Risk dataset, mirroring the
    original if/else chain.
    """
    loaders = {
        "Complete": (data_loader.CompleteDataLoader, "complete_emb.csv"),
        "Compliance": (data_loader.ComplianceDataLoader, "compliance_emb.csv"),
        "Fall": (data_loader.FallDataLoader, "fall_emb.csv"),
    }
    loader_cls, filename = loaders.get(
        case, (data_loader.RiskDataLoader, "risk_emb.csv"))
    dl = loader_cls(filename, settings).load_data()
    return dl.get_data()
def load_data_counts(case, settings):
    """Load the count-feature dataset for `case` and return (X, y).

    Any unrecognised case falls back to the Risk dataset, mirroring the
    original if/else chain.
    """
    loaders = {
        "Complete": (data_loader.CompleteDataLoader, "complete_counts.csv"),
        "Compliance": (data_loader.ComplianceDataLoader, "compliance_counts.csv"),
        "Fall": (data_loader.FallDataLoader, "fall_counts.csv"),
    }
    loader_cls, filename = loaders.get(
        case, (data_loader.RiskDataLoader, "risk_counts.csv"))
    dl = loader_cls(filename, settings).load_data()
    return dl.get_data()
def main():
    """Benchmark six classifiers on three dataset versions for every case.

    Writes one CSV of scores per case and a set of comparison plots per
    case/version combination.
    """
    num_iter = 2
    cases = ["Complete", "Compliance", "Fall", "Risk"]
    for case in cases:
        settings = load_settings(f'{case.lower()}_emb.yaml')
        output_filename = f"{case} versions results.csv"
        header = ['clf', 'version', 'accuracy', 'precision',
                  'recall', 'roc_auc', 'pr_auc']
        # Start the report with a fresh header row.
        with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'w',
                  encoding='UTF8', newline='') as f:
            csv.writer(f).writerow(header)
        for version in ('NoAts', 'Embedded', 'Counts'):
            # 'NoAts' and 'Embedded' both start from the embedded data;
            # 'NoAts' additionally drops the assistive-technology columns.
            if version == 'Counts':
                X, y = load_data_counts(case, settings)
            else:
                X, y = load_data_embedded(case, settings)
            if version == 'NoAts':
                ats_cols = [f"{i}Ats" for i in range(1, 11)]
                X = X.drop(ats_cols, axis=1)
            X, y = prepare_data(X, y, settings)
            results = train_clf(X, y, version, output_filename, num_iter)
            make_plots(results, num_iter, case, version,
                       get_version_subtitle(version))
def train_clf(X, y, version, output_filename, num_iter):
    """Evaluate six classifiers `num_iter` times and append rows to the report.

    Each row holds "mean (std)" strings per metric for one classifier in one
    iteration. Returns the list of per-iteration result dicts
    (clf name -> result). Each result is assumed to expose accuracy /
    precision / recall / rocauc / prauc score lists -- confirm against
    tools.classifiers.

    Fix: the original re-opened the report file for every single row;
    the file is now opened once in append mode (same output bytes).
    """
    iteration_results = list()
    for k in range(num_iter):
        results = {'KNN': KnnClassifier(X, y).evaluate(k),
                   'SVM': SvmClassifier(X, y).evaluate(k),
                   'LR': LrClassifier(X, y).evaluate(k),
                   'XGB': XgbClassifier(X, y).evaluate(k),
                   'RF': RfClassifier(X, y).evaluate(k),
                   'MLP': MlpClassifier(X, y).evaluate(k)}
        iteration_results.append(results)
    with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'a',
              encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        for result in iteration_results:
            for clf_name, value in result.items():
                data = [clf_name, version,
                        f"{compute_mean(value.accuracy)} ({compute_std(value.accuracy)})",
                        f"{compute_mean(value.precision)} ({compute_std(value.precision)})",
                        f"{compute_mean(value.recall)} ({compute_std(value.recall)})",
                        f"{compute_mean(value.rocauc)} ({compute_std(value.rocauc)})",
                        f"{compute_mean(value.prauc)} ({compute_std(value.prauc)})"]
                writer.writerow(data)
    return iteration_results
def make_plots(results: List[dict], num_iter: int,
               case: str, version: str, case_subtitle):
    """Write one cross-validation comparison plot per metric.

    `results` is the list of per-iteration dicts produced by train_clf
    (clf name -> result object with per-metric score lists -- confirm
    against tools.classifiers).

    Fix: the original indexed into empty `means`/`stds` lists for the
    Accuracy branch (IndexError) and overwrote the lists with scalars for
    every other metric; `clf_names` also accumulated across metrics. The
    per-metric mean/std lists are now built correctly before plotting.
    """
    # Display name -> attribute on the result objects.
    metric_attrs = {'Accuracy': 'accuracy', 'Precision': 'precision',
                    'Recall': 'recall', 'ROC_AUC': 'rocauc',
                    'PR_AUC': 'prauc'}
    for metric, attr in metric_attrs.items():
        means, stds, clf_names = list(), list(), list()
        for result in results:
            for clf_name, value in result.items():
                clf_names.append(clf_name)
                scores = getattr(value, attr)
                means.append(compute_mean(scores))
                stds.append(compute_std(scores))
        file_name = f"{case} version {version} - {metric}.pdf"
        file_writer.write_cv_plot(means, stds, metric, num_iter,
                                  clf_names, pt.REPORTS_PLOTS_DIR,
                                  file_name, case_subtitle)
if __name__ == '__main__':
main()
#!/usr/bin/env python
import numpy as np
import paths as pt
from typing import List
from tools import file_reader, file_writer, preprocessor
from tools.classifiers import KnnClassifier, SvmClassifier, LrClassifier
from tools.classifiers import XgbClassifier, RfClassifier, MlpClassifier
import tensorflow as tf
from pathlib import Path
tf.get_logger().setLevel('ERROR')
NUM_ITER = 10
CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
class Result:
    """Container pairing a classifier name with its evaluation result."""

    def __init__(self, name, result):
        self.name, self.result = name, result
class CVResult:
    """Cross-validation scores: accuracy, precision, recall, ROC AUC."""

    def __init__(self, acc, pre, rec, rocauc):
        self.acc, self.pre, self.rec, self.rocauc = acc, pre, rec, rocauc
# Assistive-technology columns (1Ats..10Ats) plus related engineered features.
ATS_COLS = [str(i)+'Ats' for i in range(1, 10+1)] \
    + ['Cluster', 'LoanPeriod', 'NumberAts']
# Exercise columns (1Ex..9Ex) plus their count feature.
EX_COLS = [str(i)+'Ex' for i in range(1, 9+1)] + ['NumberEx']
# Labels of the baseline classifiers evaluated by this script.
CLF_NAMES = ["MLP", "LR", "XGB", "RF", "SVM", "KNN"]
def load_complete():
    """Read the processed 'complete' dataset; Ats columns stay strings."""
    converters = {f'{i}Ats': str for i in range(1, 11)}
    return file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                'complete.csv',
                                converters=converters)
def load_fall():
    """Read the processed 'fall' dataset; Ats columns stay strings."""
    ats_converters = {f'{i}Ats': str for i in range(1, 11)}
    return file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                'fall.csv',
                                converters=ats_converters)
def load_compliance():
    """Read the processed 'compliance' dataset; Ats columns stay strings."""
    ats_converters = {f'{i}Ats': str for i in range(1, 11)}
    return file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                'compliance.csv',
                                converters=ats_converters)
def load_fall_test():
    """Read the processed fall-test dataset; Ex and Ats columns stay strings."""
    converters = {f'{i}Ex': str for i in range(1, 10)}
    converters.update({f'{i}Ats': str for i in range(1, 11)})
    return file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                'fall_test.csv',
                                converters=converters)
def main():
for case in CASES:
results_filename = f"{case} baseline results.txt"
# Version 1
with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "w+") as text_file:
text_file.write(f"{case} version 1 - without Ats and/or Ex columns")
if case == "Complete":
df = load_complete()
X = df.drop(['Complete'], axis=1)
y = df['Complete']
X = X.drop(ATS_COLS, axis=1)
elif case == "Compliance":
df = load_compliance()
X = df.drop(['Compliance'], axis=1)
y = df['Compliance']
X = X.drop(ATS_COLS, axis=1)
elif case == "Fall":
df = load_fall()
X = df.drop(['Fall'], axis=1)
y = df['Fall']
X = X.drop(ATS_COLS, axis=1)
else:
df = load_fall_test()
X = df.drop(['Fall'], axis=1)
y = df['Fall']
X = X.drop(ATS_COLS + EX_COLS, axis=1)
X = np.array(X)
y = np.array(y)
y_pred_probas, results_mean, results_std = [], [], []
for clf_name in CLF_NAMES:
y_pred_proba, result_mean, result_std = train_clf(X, y, clf_name,
results_filename, case, n_scale_cols=X.shape[1])
y_pred_probas.append(y_pred_proba)
results_mean.append(result_mean)
results_std.append(result_std)
make_plots(y, zip(CLF_NAMES, y_pred_probas, results_mean, results_std),
case, 1, "without Ats and/or Ex columns")
# Version 2
with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "a") as text_file:
text_file.write("\n\n")
text_file.write(f"{case} version 2 - with embeddings")
if case == "Complete":
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
X = df.drop(['Complete'], axis=1)
y = df['Complete']
elif case == "Compliance":
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_emb.csv')
X = df.drop(['Compliance'], axis=1)
y = df['Compliance']
elif case == "Fall":
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_emb.csv')
X = df.drop(['Fall'], axis=1)
y = df['Fall']
else:
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
X = df.drop(['Fall'], axis=1)
y = df['Fall']
X = np.array(X)
y = np.array(y)
ats_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
ex_cols = df.filter(regex='((\d+)[Ex])\w+', axis=1)
n_scale_cols = X.shape[1] - ats_cols.shape[1] - ex_cols.shape[1]