Commit 52f9c4f6 authored by thecml

added cuml risk score, added risk case

parent a8bf8bd5
Pipeline #95446 passed in 4 minutes and 58 seconds
---
# Dataset settings -------------------------------------------------
#
target_name: "Risk"
model_path: models/risk/embeddings
risk_period_months: 6
# Embedding Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs_ats: 10
num_epochs_ex: 5
verbose: True
network_layers: [128]
optimizer: "Adam"
# Settings for data loader -------------------------------------------------
#
features_to_normalize: ['BirthYear', 'LoanPeriod', 'NumberSplit',
'NumberScreening', 'NumberWeeks', 'MeanEvaluation',
'NumberTraining', 'NumberTrainingWeek',
'TimeBetweenTraining', 'NumberWeeksNoTraining',
'Needs', 'Physics', 'NumberAts', 'NumberEx']
features_to_scale: ['Gender', 'BirthYear',
'LoanPeriod', 'NumberSplit', 'NumberScreening',
'NumberWeeks', 'MeanEvaluation',
'NumberTraining', 'NumberTrainingWeek',
'TimeBetweenTraining', 'NumberWeeksNoTraining',
'Needs', 'Physics', 'NumberAts', 'NumberEx']
# Settings for data script -------------------------------------------------
#
features: ['Gender', 'BirthYear', 'LoanPeriod', 'NumberSplit',
'NumberScreening', 'NumberWeeks',
'MeanEvaluation', 'NumberTraining',
'NumberTrainingWeek', 'TimeBetweenTraining',
'NumberWeeksNoTraining', 'Needs', 'Physics']
# Settings for dataset -------------------------------------------------
#
use_real_ats_names: False
ats_resolution: 10
ex_resolution: 9
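For reference, a minimal sketch of how a config file like this can be consumed; it assumes PyYAML and mirrors the `load_settings(pt.CONFIGS_DIR, ...)` calls in the diffs below, whose internals may differ:

```python
from pathlib import Path

import yaml  # PyYAML; assumed dependency


def load_settings(file_path, file_name):
    # Parse the YAML config into a plain dict of hyperparameters.
    with open(Path(file_path).joinpath(file_name), 'r') as f:
        return yaml.safe_load(f)


settings = load_settings('configs', 'risk.yaml')  # hypothetical location
print(settings['target_name'], settings['risk_period_months'])  # -> Risk 6
```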
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 1,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -21,7 +21,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -39,7 +39,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": 3,
 "metadata": {},
 "outputs": [
 {
@@ -217,7 +217,7 @@
 "[686 rows x 8 columns]"
 ]
 },
-"execution_count": 6,
+"execution_count": 3,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -228,7 +228,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 4,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -240,7 +240,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": 5,
 "metadata": {},
 "outputs": [
 {
@@ -251,7 +251,7 @@
 "          random_state=20)"
 ]
 },
-"execution_count": 8,
+"execution_count": 5,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -268,7 +268,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 6,
 "metadata": {},
 "outputs": [
 {
@@ -277,7 +277,7 @@
 "0.6759696016771488"
 ]
 },
-"execution_count": 9,
+"execution_count": 6,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -288,7 +288,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": 7,
 "metadata": {},
 "outputs": [
 {
@@ -403,7 +403,7 @@
 "5 72.0 1091.0 1.0 1.0 36.0 2.0 34.0 2.0"
 ]
 },
-"execution_count": 10,
+"execution_count": 7,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -423,7 +423,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 8,
 "metadata": {},
 "outputs": [
 {
@@ -438,7 +438,7 @@
 "dtype: float64"
 ]
 },
-"execution_count": 7,
+"execution_count": 8,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -449,7 +449,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": 9,
 "metadata": {},
 "outputs": [
 {
@@ -478,7 +478,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 10,
 "metadata": {},
 "outputs": [
 {
@@ -507,7 +507,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": 11,
 "metadata": {},
 "outputs": [
 {
@@ -689,7 +689,7 @@
 "<IPython.core.display.HTML object>"
 ]
 },
-"execution_count": 10,
+"execution_count": 11,
 "metadata": {},
 "output_type": "execute_result"
 }
...
@@ -7,6 +7,8 @@ from sklearn.model_selection import train_test_split
 import xgboost as xgb
 import matplotlib.pyplot as plt
 from matplotlib.lines import Line2D
+from io import StringIO
+import shutil
 
 EPOCHS = 200
@@ -21,15 +23,21 @@ def main():
     make_dataset_full.main(ats_resolution=ats_res)
     make_dataset_emb.main(ats_resolution=ats_res)
-    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
-    df = df.sample(frac=1, random_state=0).reset_index(drop=True)
+    infile = StringIO()
+    file_path = pt.PROCESSED_DATA_DIR
+    file_name = 'complete_emb.csv'
+    with open(Path.joinpath(file_path, file_name), 'r') as fd:
+        shutil.copyfileobj(fd, infile)
+    infile.seek(0)
+    df = file_reader.read_csv(infile)
+    df = df.sample(frac=1, random_state=0).reset_index(drop=True)
     X = df.drop([target_name], axis=1)
     y = df[target_name]
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                         stratify=y,
                                                         random_state=0)
     neg, pos = np.bincount(y)
     scale_pos_weight = neg / pos
...
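The change above replaces a direct CSV read with an in-memory buffer: the file's contents are copied into a `StringIO`, the buffer is rewound, and the reader is handed the file-like object, decoupling parsing from the filesystem. A self-contained sketch of the same pattern, with pandas standing in for the project's `file_reader` and a hypothetical path:

```python
import shutil
from io import StringIO

import pandas as pd

infile = StringIO()
with open('data/processed/complete_emb.csv', 'r') as fd:  # hypothetical path
    shutil.copyfileobj(fd, infile)  # copy the file's text into the buffer
infile.seek(0)  # rewind so reading starts at the beginning
df = pd.read_csv(infile)  # pandas accepts any file-like object
```

The `scale_pos_weight = neg / pos` line at the end of the hunk is XGBoost's standard class-imbalance correction: with, say, 500 negative and 186 positive labels it comes out to roughly 2.69, upweighting the minority positive class by that factor.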
import pandas as pd
from tools import preprocessor, file_reader
import paths as pt
from pathlib import Path


def main():
    model = file_reader.read_joblib(pt.RISK_XGB_DIR,
                                    'fall_test_xgboost.joblib')
    for gender in range(0, 2):
        input_data = {"Gender": [gender],
                      "BirthYear": [72],
                      "Cluster": [10],
                      "LoanPeriod": [360],
                      "NumberSplit": [0],
                      "NumberScreening": [2],
                      "NumberWeeks": [3],
                      "MeanEvaluation": [4],
                      "NumberFalls": [1],
                      "NumberTraining": [8],
                      "NumberTrainingWeek": [1],
                      "TimeBetweenTraining": [3.5],
                      "NumberWeeksNoTraining": [1],
                      "Needs": [40],
                      "Physics": [43],
                      "Ex": ["8058,8062,8066"],
                      "Ats": ["222718,093307,181210"]}
        new_data_df = pd.DataFrame.from_dict(input_data)
        new_data_df['NumberAts'] = len(new_data_df['Ats'][0].split(","))
        new_data_df['NumberEx'] = len(new_data_df['Ex'][0].split(","))

        # Expand the comma-separated Ats/Ex strings into fixed-width columns.
        df = preprocessor.split_cat_columns(new_data_df, col_to_split='Ats',
                                            tag='Ats', resolution=10)
        df = preprocessor.split_cat_columns(df, col_to_split='Ex',
                                            tag='Ex', resolution=9)

        cols_ats = [str(i) + 'Ats' for i in range(1, 10 + 1)]
        cols_ex = [str(i) + 'Ex' for i in range(1, 9 + 1)]
        header_list = ['Gender', 'BirthYear', "Cluster",
                       "LoanPeriod", "NumberSplit", "NumberScreening",
                       "NumberWeeks", "MeanEvaluation", "NumberFalls",
                       "NumberTraining", "NumberTrainingWeek", "TimeBetweenTraining",
                       "NumberWeeksNoTraining", "NumberCancels", "NumberCancelsWeek",
                       "Needs", "Physics", "NumberAts", "NumberEx"] + cols_ats + cols_ex
        df = df.reindex(columns=header_list)
        df = df.fillna('0')

        # Map each Ats/Ex token to its learned embedding value.
        path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
        for i in range(1, 10 + 1):
            embedding = file_reader.read_embedding(path, f'fall_test_{i}Ats.csv')
            column = f'{i}Ats'
            df[column] = df[column].replace(to_replace=embedding)
            df[column] = pd.to_numeric(df[column])
        for i in range(1, 9 + 1):
            embedding = file_reader.read_embedding(path, f'fall_test_{i}Ex.csv')
            column = f'{i}Ex'
            df[column] = df[column].replace(to_replace=embedding)
            df[column] = pd.to_numeric(df[column])

        prediction = model.predict(df)
        probability = model.predict_proba(df).max()
        print(f"Using gender {gender}, predicted " +
              f"{int(prediction[0])} with probability {round(float(probability) * 100, 1)}%")


if __name__ == "__main__":
    main()
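The script above leans on `preprocessor.split_cat_columns`, which, judging from its use here, explodes a comma-separated string column into fixed-width `1Ats ... 10Ats` style columns. A rough stand-in under that assumption:

```python
import pandas as pd


def split_cat_columns(df, col_to_split, tag, resolution):
    # Assumed behavior: split "a,b,c" into up to `resolution` columns named 1Tag..NTag.
    parts = df[col_to_split].str.split(',', expand=True).iloc[:, :resolution]
    parts.columns = [f'{i}{tag}' for i in range(1, parts.shape[1] + 1)]
    return pd.concat([df.drop(columns=[col_to_split]), parts], axis=1)


df = pd.DataFrame({'Ats': ['222718,093307,181210']})
print(split_cat_columns(df, 'Ats', 'Ats', resolution=10))
#      1Ats    2Ats    3Ats
# 0  222718  093307  181210
```

Columns beyond the number of tokens are created afterwards by the script's `reindex`/`fillna('0')` step, so the stand-in does not need to pad to the full resolution itself.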
@@ -49,22 +49,21 @@ def main():
         writer.writerow(header)
     if case == "Complete":
-        settings = load_settings("complete_emb.yaml")
-        dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
+        settings = load_settings(pt.CONFIGS_DIR, "complete.yaml")
+        dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
+                                            "complete_emb.csv", settings).load_data()
         X, y = dl.get_data()
     elif case == "Compliance":
-        settings = load_settings("compliance_emb.yaml")
-        dl = data_loader.ComplianceDataLoader("compliance_emb.csv", settings).load_data()
-        X, y = dl.get_data()
-    elif case == "Fall":
-        settings = load_settings("fall_emb.yaml")
-        dl = data_loader.AlarmDataLoader("fall_emb.csv", settings).load_data()
+        settings = load_settings(pt.CONFIGS_DIR, "compliance.yaml")
+        dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
+                                              "compliance_emb.csv", settings).load_data()
         X, y = dl.get_data()
     else:
-        settings = load_settings("risk_emb.yaml")
-        dl = data_loader.FallDataLoader("risk_emb.csv", settings).load_data()
+        settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
+        dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "fall_emb.csv", settings).load_data()
         X, y = dl.get_data()
     X, y = dl.prepare_data()
 
     versions = ['NoCW', 'CW', 'Oversampling']
     metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'f1']
...
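Both this diff and the next one dispatch on `case` through if/elif chains that map a case name to a loader class and a file name. A table-driven dispatch is a compact alternative; the classes below are stubs standing in for the project's `data_loader` module, so this is a sketch of the design choice rather than the project's API:

```python
# Stub loaders standing in for data_loader.FallDataLoader / RiskDataLoader.
class FallDataLoader:
    def __init__(self, file_path, file_name, settings):
        self.file_path, self.file_name, self.settings = file_path, file_name, settings


class RiskDataLoader(FallDataLoader):
    pass


# One lookup table replaces the repeated if/elif branches.
LOADERS = {
    "Fall": (FallDataLoader, "fall_emb.csv"),
    "Risk": (RiskDataLoader, "risk_emb.csv"),
}

case = "Risk"
cls, file_name = LOADERS[case]
dl = cls("data/processed", file_name, settings={})
print(type(dl).__name__, dl.file_name)  # -> RiskDataLoader risk_emb.csv
```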
@@ -9,6 +9,8 @@ from pathlib import Path
 import csv
 from utility.settings import load_settings
 from utility.metrics import compute_mean, compute_std
+from io import BytesIO
+import shutil
 
 NUM_ITER = 1
@@ -38,10 +40,14 @@ def load_data_embedded(case, settings):
         dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
                                               "compliance_emb.csv", settings).load_data()
         X, y = dl.get_data()
-    else:
+    elif case == "Fall":
         dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
                                         "fall_emb.csv", settings).load_data()
         X, y = dl.get_data()
+    else:
+        dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "risk_emb.csv", settings).load_data()
+        X, y = dl.get_data()
     return X, y
 
 def load_data_count(case, settings):
@@ -53,10 +59,14 @@ def load_data_count(case, settings):
         dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
                                               "compliance_count.csv", settings).load_data()
         X, y = dl.get_data()
-    else:
+    elif case == "Fall":
         dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
                                         "fall_count.csv", settings).load_data()
         X, y = dl.get_data()
+    else:
+        dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "risk_count.csv", settings).load_data()
+        X, y = dl.get_data()
     return X, y
 
 def load_data_ohe(case, settings):
@@ -68,17 +78,22 @@ def load_data_ohe(case, settings):
         dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
                                               "compliance_ohe.csv", settings).load_data()
         X, y = dl.get_data()
-    else:
+    elif case == "Fall":
         dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
                                         "fall_ohe.csv", settings).load_data()
         X, y = dl.get_data()
+    else:
+        dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "risk_ohe.csv", settings).load_data()
+        X, y = dl.get_data()
     return X, y
 
 def main():
     clf_names = ['KNN', 'SVM', 'LR', 'XGB', 'RF', 'MLP']
     num_clfs = len(clf_names)
     metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'f1']
-    cases = ["Complete", "Compliance", "Fall"]
+    #cases = ["Complete", "Compliance", "Fall", "Risk"]
+    cases = ["Risk"]
     for case in cases:
         target_settings = load_settings(pt.CONFIGS_DIR, f'{case.lower()}.yaml')
         data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
@@ -91,14 +106,9 @@ def main():
                   encoding='UTF8', newline='') as f:
             writer = csv.writer(f)
             writer.writerow(header)
-        versions = ['NoAts', 'Embedded', 'Counts', 'OneHot']
+        versions = ['Embedded', 'Counts', 'OneHot']
         for version in versions:
-            if version == 'NoAts':
-                ats_resolution = data_settings['ats_resolution']
-                ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
-                X, y = load_data_embedded(case, target_settings)
-                X = X.drop(ats_cols, axis=1)
-            elif version == "Embedded":
+            if version == "Embedded":
                 X, y = load_data_embedded(case, target_settings)
             elif version == "Counts":
                 X, y = load_data_count(case, target_settings)
@@ -137,7 +147,7 @@ def train_clf(X, y, version, output_filename, metrics, num_iter):
     return iteration_results
 
 def make_plots(results: np.ndarray, metrics: List[str], num_iter: int,
-               num_clfs: int, clf_names, case: str, version: str, case_subtitle):
+               num_clfs: int, clf_names, case: str, version: str, subtitle):
     for metric in metrics:
         total_means, total_stds = list(), list()
         for iter_result in results:
@@ -149,10 +159,15 @@ def make_plots(results: np.ndarray, metrics: List[str], num_iter: int,
             total_stds.append(stds)
         total_means = np.stack(total_means, axis=-1)
         total_stds = np.stack(total_stds, axis=-1)
+        outfile = BytesIO()
+        file_path = pt.REPORTS_PLOTS_DIR
         file_name = f"{case} version {version} - {metric}.pdf"
-        file_writer.write_cv_plot(total_means, total_stds, metric,
-                                  num_iter, clf_names, pt.REPORTS_PLOTS_DIR,
-                                  file_name, case_subtitle)
+        with open(Path.joinpath(file_path, file_name), 'wb') as fd:
+            file_writer.write_cv_plot(total_means, total_stds, metric, num_iter,
+                                      clf_names, file_name, subtitle,
+                                      outfile)
+            outfile.seek(0)
+            shutil.copyfileobj(outfile, fd)
 
 if __name__ == '__main__':
     main()
...
@@ -10,6 +10,9 @@ from utility.metrics import gini_xgb
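The `make_plots` change above mirrors the buffered read earlier in the commit: the plot is rendered into a `BytesIO` buffer and then copied to disk. A minimal matplotlib sketch of that write path; the figure content and output path are illustrative, and `write_cv_plot`'s real signature may differ:

```python
import shutil
from io import BytesIO

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([1, 2, 3], [0.60, 0.70, 0.65])  # stand-in for the CV metric curves
outfile = BytesIO()
fig.savefig(outfile, format='pdf')  # render the figure into the in-memory buffer
outfile.seek(0)  # rewind before copying
with open('report.pdf', 'wb') as fd:  # hypothetical output location
    shutil.copyfileobj(outfile, fd)
```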