Commit 52f9c4f6 authored by thecml

added cuml risk score, added risk case

parent a8bf8bd5
Pipeline #95446 passed with stage in 4 minutes and 58 seconds
---
# Dataset Stuff -------------------------------------------------
#
target_name: "Risk"
model_path: models/risk/embeddings
risk_period_months: 6
# Embedding Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs_ats: 10
num_epochs_ex: 5
verbose: True
network_layers: [128]
optimizer: "Adam"
# Settings for data loader -------------------------------------------------
#
features_to_normalize: ['BirthYear', 'LoanPeriod', 'NumberSplit',
                        'NumberScreening', 'NumberWeeks', 'MeanEvaluation',
                        'NumberTraining', 'NumberTrainingWeek',
                        'TimeBetweenTraining', 'NumberWeeksNoTraining',
                        'Needs', 'Physics', 'NumberAts', 'NumberEx']
features_to_scale: ['Gender', 'BirthYear',
                    'LoanPeriod', 'NumberSplit', 'NumberScreening',
                    'NumberWeeks', 'MeanEvaluation',
                    'NumberTraining', 'NumberTrainingWeek',
                    'TimeBetweenTraining', 'NumberWeeksNoTraining',
                    'Needs', 'Physics', 'NumberAts', 'NumberEx']
# Settings for data script -------------------------------------------------
#
features: ['Gender', 'BirthYear', 'LoanPeriod', 'NumberSplit',
           'NumberScreening', 'NumberWeeks',
           'MeanEvaluation', 'NumberTraining',
           'NumberTrainingWeek', 'TimeBetweenTraining',
           'NumberWeeksNoTraining', 'Needs', 'Physics']
# Settings for dataset -------------------------------------------------
#
use_real_ats_names: False
ats_resolution: 10
ex_resolution: 9
\ No newline at end of file
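The config above is consumed through the project's load_settings helper. Below is a minimal sketch of a compatible loader, assuming PyYAML and the (directory, file name) call signature used by the scripts later in this commit; the real utility/settings.py may differ.

# Minimal sketch of a settings loader compatible with calls like
# load_settings(pt.CONFIGS_DIR, "complete.yaml"); illustrative only.
from pathlib import Path
import yaml

def load_settings(file_path, file_name):
    # Parse the YAML config into a plain dict
    with open(Path(file_path).joinpath(file_name), 'r') as stream:
        return yaml.safe_load(stream)

# Example: settings = load_settings('configs', 'risk.yaml')
#          settings['ats_resolution'] -> 10, settings['target_name'] -> 'Risk'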
@@ -2,7 +2,7 @@
  "cells": [
  {
   "cell_type": "code",
-  "execution_count": 4,
+  "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -21,7 +21,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 5,
+  "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -39,7 +39,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 6,
+  "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@@ -217,7 +217,7 @@
      "[686 rows x 8 columns]"
     ]
    },
-   "execution_count": 6,
+   "execution_count": 3,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -228,7 +228,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 7,
+  "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -240,7 +240,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 8,
+  "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@@ -251,7 +251,7 @@
      " random_state=20)"
     ]
    },
-   "execution_count": 8,
+   "execution_count": 5,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -268,7 +268,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 9,
+  "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
@@ -277,7 +277,7 @@
      "0.6759696016771488"
     ]
    },
-   "execution_count": 9,
+   "execution_count": 6,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -288,7 +288,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 10,
+  "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
@@ -403,7 +403,7 @@
      "5 72.0 1091.0 1.0 1.0 36.0 2.0 34.0 2.0"
     ]
    },
-   "execution_count": 10,
+   "execution_count": 7,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -423,7 +423,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 7,
+  "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@@ -438,7 +438,7 @@
      "dtype: float64"
     ]
    },
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -449,7 +449,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 8,
+  "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@@ -478,7 +478,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 9,
+  "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
@@ -507,7 +507,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 10,
+  "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@@ -689,7 +689,7 @@
      "<IPython.core.display.HTML object>"
     ]
    },
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "output_type": "execute_result"
   }
......
@@ -7,6 +7,8 @@ from sklearn.model_selection import train_test_split
 import xgboost as xgb
 import matplotlib.pyplot as plt
 from matplotlib.lines import Line2D
+from io import StringIO
+import shutil
 EPOCHS = 200
@@ -21,15 +23,21 @@ def main():
     make_dataset_full.main(ats_resolution=ats_res)
     make_dataset_emb.main(ats_resolution=ats_res)
-    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
-    df = df.sample(frac=1, random_state=0).reset_index(drop=True)
+    infile = StringIO()
+    file_path = pt.PROCESSED_DATA_DIR
+    file_name = 'complete_emb.csv'
+    with open(Path.joinpath(file_path, file_name), 'r') as fd:
+        shutil.copyfileobj(fd, infile)
+    infile.seek(0)
+    df = file_reader.read_csv(infile)
+    df = df.sample(frac=1, random_state=0).reset_index(drop=True)
     X = df.drop([target_name], axis=1)
     y = df[target_name]
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                         stratify=y,
                                                         random_state=0)
     neg, pos = np.bincount(y)
     scale_pos_weight = neg / pos
......
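The refactored block above reads the CSV through an in-memory buffer rather than passing a directory to the reader, and then derives XGBoost's scale_pos_weight from the class counts. A self-contained sketch of the same pattern, with pandas standing in for the repo's file_reader and an illustrative file name and target column:

# Sketch of the buffered-read pattern above; 'complete_emb.csv' and the
# 'Risk' target are illustrative, the real names come from the configs.
from io import StringIO
import shutil
import numpy as np
import pandas as pd

infile = StringIO()
with open('complete_emb.csv', 'r') as fd:
    shutil.copyfileobj(fd, infile)   # copy the file contents into memory
infile.seek(0)                       # rewind the buffer before parsing
df = pd.read_csv(infile)

y = df['Risk']                       # illustrative 0/1 target
neg, pos = np.bincount(y)            # negative and positive class counts
scale_pos_weight = neg / pos         # XGBoost weight for the minority class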
import pandas as pd
import numpy as np
from tools import preprocessor, file_reader
import paths as pt
import os
import csv
import joblib
from pathlib import Path

def main():
    model = file_reader.read_joblib(pt.RISK_XGB_DIR,
                                    'fall_test_xgboost.joblib')
    for gender in range(0, 2):
        # Single synthetic citizen record used as a risk-case smoke test
        input_data = {"Gender": [gender],
                      "BirthYear": [72],
                      "Cluster": [10],
                      "LoanPeriod": [360],
                      "NumberSplit": [0],
                      "NumberScreening": [2],
                      "NumberWeeks": [3],
                      "MeanEvaluation": [4],
                      "NumberFalls": [1],
                      "NumberTraining": [8],
                      "NumberTrainingWeek": [1],
                      "TimeBetweenTraining": [3.5],
                      "NumberWeeksNoTraining": [1],
                      "Needs": [40],
                      "Physics": [43],
                      "Ex": ["8058,8062,8066"],
                      "Ats": ["222718,093307,181210"]}
        new_data_df = pd.DataFrame.from_dict(input_data)
        new_data_df['NumberAts'] = len(new_data_df['Ats'][0].split(","))
        new_data_df['NumberEx'] = len(new_data_df['Ex'][0].split(","))
        # Split the comma-separated Ats/Ex strings into one column per slot
        df = preprocessor.split_cat_columns(new_data_df, col_to_split='Ats',
                                            tag='Ats',
                                            resolution=10)
        df = preprocessor.split_cat_columns(df, col_to_split='Ex',
                                            tag='Ex',
                                            resolution=9)  # ex_resolution is 9 (see config)
        cols_ats = [str(i)+'Ats' for i in range(1, 10+1)]
        cols_ex = [str(i)+'Ex' for i in range(1, 9+1)]
        header_list = ['Gender', 'BirthYear', "Cluster",
                       "LoanPeriod", "NumberSplit", "NumberScreening",
                       "NumberWeeks", "MeanEvaluation", "NumberFalls",
                       "NumberTraining", "NumberTrainingWeek", "TimeBetweenTraining",
                       "NumberWeeksNoTraining", "NumberCancels", "NumberCancelsWeek",
                       "Needs", "Physics", "NumberAts", "NumberEx"] + cols_ats + cols_ex
        df = df.reindex(columns=header_list)
        df = df.fillna('0')
        # Map each categorical Ats/Ex code to its learned scalar embedding
        for i in range(1, 10+1):
            path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
            embedding = file_reader.read_embedding(path, f'fall_test_{i}Ats.csv')
            column = f'{i}Ats'
            df[column] = df[column].replace(to_replace=embedding)
            df[column] = pd.to_numeric(df[column])
        for i in range(1, 9+1):
            path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
            embedding = file_reader.read_embedding(path, f'fall_test_{i}Ex.csv')
            column = f'{i}Ex'
            df[column] = df[column].replace(to_replace=embedding)
            df[column] = pd.to_numeric(df[column])
        prediction = model.predict(df)
        probability = model.predict_proba(df).max()
        print(f"Using gender {gender}, predicted " +
              f"{int(prediction[0])} with probability {round(float(probability) * 100, 1)}%")

if __name__ == "__main__":
    main()
\ No newline at end of file
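The replace/to_numeric loop above swaps each categorical Ats/Ex code for a learned scalar embedding read from CSV. A toy illustration of that lookup mechanic, with invented embedding values:

# Toy illustration of the embedding lookup; the mapping values here are
# invented, the real ones come from file_reader.read_embedding().
import pandas as pd

df = pd.DataFrame({'1Ats': ['222718'], '2Ats': ['093307'], '3Ats': ['181210']})
embedding = {'222718': 0.41, '093307': -0.12, '181210': 0.07}  # hypothetical
for column in ['1Ats', '2Ats', '3Ats']:
    df[column] = df[column].replace(to_replace=embedding)  # code -> embedding
    df[column] = pd.to_numeric(df[column])                 # ensure numeric dtype
print(df.dtypes)  # all float64 once every code has an embedding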
@@ -49,22 +49,21 @@ def main():
         writer.writerow(header)
     if case == "Complete":
-        settings = load_settings("complete_emb.yaml")
-        dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
+        settings = load_settings(pt.CONFIGS_DIR, "complete.yaml")
+        dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
+                                            "complete_emb.csv", settings).load_data()
         X, y = dl.get_data()
     elif case == "Compliance":
-        settings = load_settings("compliance_emb.yaml")
-        dl = data_loader.ComplianceDataLoader("compliance_emb.csv", settings).load_data()
-        X, y = dl.get_data()
-    elif case == "Fall":
-        settings = load_settings("fall_emb.yaml")
-        dl = data_loader.AlarmDataLoader("fall_emb.csv", settings).load_data()
+        settings = load_settings(pt.CONFIGS_DIR, "compliance.yaml")
+        dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
+                                              "compliance_emb.csv", settings).load_data()
         X, y = dl.get_data()
     else:
-        settings = load_settings("risk_emb.yaml")
-        dl = data_loader.FallDataLoader("risk_emb.csv", settings).load_data()
+        settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
+        dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "fall_emb.csv", settings).load_data()
         X, y = dl.get_data()
+    X, y = dl.prepare_data()
     versions = ['NoCW', 'CW', 'Oversampling']
     metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'f1']
......
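Since the branches above differ only in loader class, config name and CSV name, the dispatch could also be table-driven. A hypothetical sketch, assuming the repo's data_loader, load_settings and paths (pt) modules and the shared (data_dir, file_name, settings) constructor used in this commit:

# Hypothetical table-driven rewrite of the case dispatch; illustrative only.
CASES = {
    "Complete":   (data_loader.CompleteDataLoader,   "complete.yaml",   "complete_emb.csv"),
    "Compliance": (data_loader.ComplianceDataLoader, "compliance.yaml", "compliance_emb.csv"),
    "Fall":       (data_loader.FallDataLoader,       "fall.yaml",       "fall_emb.csv"),
}
loader_cls, cfg_name, csv_name = CASES.get(case, CASES["Fall"])
settings = load_settings(pt.CONFIGS_DIR, cfg_name)
dl = loader_cls(pt.PROCESSED_DATA_DIR, csv_name, settings).load_data()
X, y = dl.prepare_data()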
@@ -9,6 +9,8 @@ from pathlib import Path
 import csv
 from utility.settings import load_settings
 from utility.metrics import compute_mean, compute_std
+from io import BytesIO
+import shutil
 NUM_ITER = 1
@@ -38,10 +40,14 @@ def load_data_embedded(case, settings):
         dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
                                               "compliance_emb.csv", settings).load_data()
         X, y = dl.get_data()
-    else:
+    elif case == "Fall":
         dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
                                         "fall_emb.csv", settings).load_data()
         X, y = dl.get_data()
+    else:
+        dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "risk_emb.csv", settings).load_data()
+        X, y = dl.get_data()
     return X, y
@@ -53,10 +59,14 @@ def load_data_count(case, settings):
         dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
                                               "compliance_count.csv", settings).load_data()
         X, y = dl.get_data()
-    else:
+    elif case == "Fall":
         dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
                                         "fall_count.csv", settings).load_data()
         X, y = dl.get_data()
+    else:
+        dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "risk_count.csv", settings).load_data()
+        X, y = dl.get_data()
     return X, y
@@ -68,17 +78,22 @@ def load_data_ohe(case, settings):
         dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
                                               "compliance_ohe.csv", settings).load_data()
         X, y = dl.get_data()
-    else:
+    elif case == "Fall":
         dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
                                         "fall_ohe.csv", settings).load_data()
         X, y = dl.get_data()
+    else:
+        dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "risk_ohe.csv", settings).load_data()
+        X, y = dl.get_data()
     return X, y

 def main():
     clf_names = ['KNN', 'SVM', 'LR', 'XGB', 'RF', 'MLP']
     num_clfs = len(clf_names)
     metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'f1']
-    cases = ["Complete", "Compliance", "Fall"]
+    #cases = ["Complete", "Compliance", "Fall", "Risk"]
+    cases = ["Risk"]
     for case in cases:
         target_settings = load_settings(pt.CONFIGS_DIR, f'{case.lower()}.yaml')
         data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
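The new else branches rely on the RiskDataLoader added for the risk case. Its implementation is not shown in this diff; the following is a hypothetical minimal shape consistent with the load_data()/get_data() calls above, not the repo's actual class:

# Hypothetical minimal RiskDataLoader; the real class in tools/data_loader.py may differ.
from pathlib import Path
import pandas as pd

class RiskDataLoader:
    def __init__(self, file_path, file_name, settings):
        self.file_path = file_path
        self.file_name = file_name
        self.settings = settings
        self.df = None

    def load_data(self):
        # Read the processed CSV for the risk case
        self.df = pd.read_csv(Path(self.file_path, self.file_name))
        return self

    def get_data(self):
        # Split into features and the configured target (e.g. "Risk")
        target = self.settings['target_name']
        return self.df.drop([target], axis=1), self.df[target]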
@@ -91,14 +106,9 @@
                   encoding='UTF8', newline='') as f:
             writer = csv.writer(f)
             writer.writerow(header)
-        versions = ['NoAts', 'Embedded', 'Counts', 'OneHot']
+        versions = ['Embedded', 'Counts', 'OneHot']
         for version in versions:
-            if version == 'NoAts':
-                ats_resolution = data_settings['ats_resolution']
-                ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
-                X, y = load_data_embedded(case, target_settings)
-                X = X.drop(ats_cols, axis=1)
-            elif version == "Embedded":
+            if version == "Embedded":
                 X, y = load_data_embedded(case, target_settings)
             elif version == "Counts":
                 X, y = load_data_count(case, target_settings)
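The remaining versions compare three Ats/Ex representations: learned embeddings, per-class counts, and one-hot indicators. A toy illustration of how the count and one-hot encodings relate, with invented column names:

# Toy sketch of "Counts" vs "OneHot" encodings; 'CitizenId' and
# 'DevISOClass' are illustrative names, not the repo's actual schema.
import pandas as pd

ats = pd.DataFrame({'CitizenId':   [1, 1, 2],
                    'DevISOClass': ['2218', '0933', '2218']})
counts = ats.groupby(['CitizenId', 'DevISOClass']).size().unstack(fill_value=0)  # count encoding
onehot = (counts > 0).astype(int)                                                # one-hot encoding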
@@ -137,7 +147,7 @@ def train_clf(X, y, version, output_filename, metrics, num_iter):
     return iteration_results

 def make_plots(results: np.ndarray, metrics: List[str], num_iter: int,
-               num_clfs: int, clf_names, case: str, version: str, case_subtitle):
+               num_clfs: int, clf_names, case: str, version: str, subtitle):
     for metric in metrics:
         total_means, total_stds = list(), list()
         for iter_result in results:
@@ -149,10 +159,15 @@
             total_stds.append(stds)
         total_means = np.stack(total_means, axis=-1)
         total_stds = np.stack(total_stds, axis=-1)
+        outfile = BytesIO()
+        file_path = pt.REPORTS_PLOTS_DIR
         file_name = f"{case} version {version} - {metric}.pdf"
-        file_writer.write_cv_plot(total_means, total_stds, metric,
-                                  num_iter, clf_names, pt.REPORTS_PLOTS_DIR,
-                                  file_name, case_subtitle)
+        with open(Path.joinpath(file_path, file_name), 'wb') as fd:
+            file_writer.write_cv_plot(total_means, total_stds, metric, num_iter,
+                                      clf_names, file_name, subtitle,
+                                      outfile)
+            outfile.seek(0)
+            shutil.copyfileobj(outfile, fd)
if __name__ == '__main__':
main()
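make_plots now renders each figure into a BytesIO buffer and copies it to disk afterwards. A self-contained sketch of that write-to-buffer-then-copy pattern, with matplotlib standing in for the repo's file_writer and an illustrative output path:

# Sketch of the render-to-buffer-then-copy pattern; 'plot.pdf' and the
# bar values are illustrative.
from io import BytesIO
from pathlib import Path
import shutil
import matplotlib.pyplot as plt

outfile = BytesIO()
fig, ax = plt.subplots()
ax.bar(['KNN', 'SVM', 'XGB'], [0.71, 0.68, 0.74])   # illustrative scores
fig.savefig(outfile, format='pdf')                   # render into memory
plt.close(fig)

outfile.seek(0)                                      # rewind before copying
with open(Path('plot.pdf'), 'wb') as fd:
    shutil.copyfileobj(outfile, fd)                  # flush buffer to disk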
@@ -10,6 +10,9 @@ from utility.metrics import gini_xgb
 import shap
 from typing import List
 from utility.settings import load_settings
+from io import BytesIO, StringIO
+import shutil
+from pathlib import Path
 NUM_ITERATIONS = 1
@@ -17,16 +20,19 @@ def main():
     cases = ["Complete", "Compliance", "Fall"]
     for case in cases:
         if case == "Complete":
-            settings = load_settings("complete.yaml")
-            dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
+            settings = load_settings(pt.CONFIGS_DIR, "complete.yaml")
+            dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
+                                                "complete_emb.csv", settings).load_data()
             X, y = dl.get_data()
         elif case == "Compliance":
-            settings = load_settings("compliance.yaml")
-            dl = data_loader.ComplianceDataLoader("compliance_emb.csv", settings).load_data()
+            settings = load_settings(pt.CONFIGS_DIR, "compliance.yaml")
+            dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
+                                                  "compliance_emb.csv", settings).load_data()
             X, y = dl.get_data()
         else:
-            settings = load_settings("fall.yaml")
-            dl = data_loader.FallDataLoader("fall_emb.csv", settings).load_data()
+            settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
+            dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
+                                            "fall_emb.csv", settings).load_data()
             X, y = dl.get_data()
         features = dl.get_features()
@@ -46,10 +52,24 @@
         importances = list(shap_sorted_df['shap_values'])
         features = list(shap_sorted_df['feature'])
-        plot_file_name = f"{case} SHAP feature values"
-        csv_file_name = f"{case} model features.csv"
-        file_writer.write_shap_importance_plot(features, importances, pt.REPORTS_PLOTS_DIR, plot_file_name)
-        file_writer.write_csv(shap_sorted_df, pt.REPORTS_DIR, csv_file_name)
+        outfile = BytesIO()
+        file_path = pt.REPORTS_PLOTS_DIR
+        file_name = f"{case} SHAP feature values"
+        with open(Path.joinpath(file_path, file_name), 'wb') as fd:
+            file_writer.write_shap_importance_plot(features, importances,
+                                                   file_name,
+                                                   outfile)
+            outfile.seek(0)
+            shutil.copyfileobj(outfile, fd)
+        outfile = StringIO()
+        file_path = pt.REPORTS_PLOTS_DIR
+        file_name = f"{case} model features.csv"
+        with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
+            file_writer.write_csv(shap_sorted_df, outfile)
+            outfile.seek(0)
+            shutil.copyfileobj(outfile, fd)
def get_best_shap_features(X: np.ndarray, y: np.ndarray,
                           cols: List[str], seed: int):
......
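For context, here is a sketch of the SHAP-based feature ranking that main() performs, run on toy data; the real features and model come from the data loaders, and the repo's get_best_shap_features may differ:

# Sketch of SHAP feature ranking for an XGBoost model; toy data only.
import numpy as np
import pandas as pd
import shap
import xgboost as xgb

X = pd.DataFrame(np.random.rand(100, 3),
                 columns=['BirthYear', 'Needs', 'Physics'])  # toy features
y = np.random.randint(0, 2, 100)                             # toy binary target

model = xgb.XGBClassifier(n_estimators=50).fit(X, y)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Rank features by mean absolute SHAP value
shap_sorted_df = (pd.DataFrame({'feature': X.columns,
                                'shap_values': np.abs(shap_values).mean(axis=0)})
                  .sort_values('shap_values', ascending=False))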
@@ -12,18 +12,18 @@ from tools import data_loader, file_writer, file_reader
 from sklearn.metrics import accuracy_score, precision_score
 from sklearn.metrics import recall_score, roc_auc_score
 import matplotlib.pyplot as plt
+from utility.settings import load_settings
+from io import StringIO
+import shutil

 def main():
     # Load settings
-    with open(Path.joinpath(pt.CONFIGS_DIR, "fall.yaml"), 'r') as stream:
-        settings = yaml.safe_load(stream)
+    settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
     protected_col_name = "Gender"
     y_col_name = "Fall"

     # Load the data
     file_name = "fall_emb.csv"
-    dl = data_loader.AlarmDataLoader(file_name, settings).load_data()
+    dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR, file_name, settings).load_data()
     X, y = dl.get_data()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
@@ -81,16 +81,15 @@ def main():
         # Save metrics
         df_evaluate_proc = metrics.get_cm_by_protected_variable(df_subset, protected_col_name,
                                                                 y_col_name, "output")
-        file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, "model"+str(i)
-                              + "_" + protected_col_name + ".csv")
+        write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, f"model{i}_{protected_col_name}.csv")
         df_evaluate_together = df_subset.copy()
         df_evaluate_together[protected_col_name] = "all"
-        df_evaluate_all = metrics.get_cm_by_protected_variable(df_evaluate_together, protected_col_name,
-                                                               y_col_name, "output")
-        file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, "model"+str(i)
-                              + "_" + protected_col_name + "_all.csv")
+        df_evaluate_all = metrics.get_cm_by_protected_variable(df_evaluate_together,
+                                                               protected_col_name,
+                                                               y_col_name, "output")
+        write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, f"model{i}_{protected_col_name}_all.csv")
         valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
         valid_pre.append(precision_score(y_valid_split, y_valid_scores))
         valid_recall.append(recall_score(y_valid_split, y_valid_scores))
@@ -98,7 +97,7 @@ def main():
         i = i + 1

-    file_writer.write_csv(df_test, pt.INTERIM_DATA_DIR, "all_test_data.csv")
+    write_csv(df_test, pt.INTERIM_DATA_DIR, "all_test_data.csv")

     # Evaluate
     y_pred = model.predict(X_test)
@@ -114,33 +113,36 @@ def main():
                     'NPV', 'PPV', 'TNR', 'TPR', 'TP', 'TN', 'FN', 'FP']
     df_out = pd.DataFrame(columns=column_names)

-    for i in range(5):
-        data = file_reader.read_csv(pt.INTERIM_DATA_DIR, f'model{i}_{protected_col_name}_all.csv')
+    for i in range(5):
+        data = read_csv(pt.INTERIM_DATA_DIR, f'model{i}_{protected_col_name}_all.csv')
         for group in ["all"]:
             for measure in measures:
                 value = float(data[data[protected_col_name] == group][measure])
                 df_out = df_out.append({'Group': group, "ML": "XGBoost"+str(i),
                                         "Measure": measure, "Value": value}, ignore_index=True)
-    file_writer.write_csv(df_out, pt.INTERIM_DATA_DIR, 'XGBoost_metrics_crossvalidated_all.csv')
+    write_csv(df_out, pt.INTERIM_DATA_DIR, "XGBoost_metrics_crossvalidated_all.csv")

     global_all_bar = sns.barplot(data=df_out[df_out["Measure"].isin(["FPR", "FNR", "TPR", "TNR"])],
                                  x="Group", y="Value", ci=95, hue="Measure")
     global_all_bar.set_title('All')
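For reference, a self-contained sketch of computing the per-group confusion-matrix rates (TPR/TNR/FPR/FNR) that feed this plot, on toy data; the repo's metrics.get_cm_by_protected_variable may differ in detail:

# Sketch of per-group confusion-matrix rates by a protected attribute;
# the DataFrame values are toy data.
import pandas as pd
from sklearn.metrics import confusion_matrix

df = pd.DataFrame({'Gender': [0, 0, 1, 1, 1, 0],
                   'Fall':   [1, 0, 1, 1, 0, 0],
                   'output': [1, 0, 0, 1, 0, 1]})

rows = []
for gender, part in df.groupby('Gender'):
    # Confusion matrix restricted to one group of the protected variable
    tn, fp, fn, tp = confusion_matrix(part['Fall'], part['output'],
                                      labels=[0, 1]).ravel()
    rows.append({'Gender': gender,
                 'TPR': tp / (tp + fn), 'TNR': tn / (tn + fp),
                 'FPR': fp / (fp + tn), 'FNR': fn / (fn + tp)})
print(pd.DataFrame(rows))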