Commit e8b5e276 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

improved scripts

parent 86f84d7c
......@@ -1183,7 +1183,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.8.11"
},
"orig_nbformat": 4
},
......
......@@ -11,8 +11,8 @@ import os
import csv
import joblib
import pandas as pd
import numpy as np
import io
from pathlib import Path
from typing import List, Optional
from fastapi import Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
......@@ -96,14 +96,12 @@ class InputData(pydantic.BaseModel):
class AlarmOutputData(pydantic.BaseModel):
EventTimes: list
SurvivalProbs: list
HazardProbs: list
AlarmArguments: list
class TrainingOutputData(pydantic.BaseModel):
CompleteProb: float
FallProb: float
Compliance: int
CompleteArguments: list
FallArguments: list
@app.get('/')
def index():
......@@ -188,13 +186,20 @@ def predict_alarm(incoming_data: InputData):
label_encoders = read_pickle("alarm_labels.pkl")
df_for_alarm = add_label_encoding(df.copy(), label_encoders, ats_resolution)
surv_probs = model.predict_survival_function(df_for_alarm, return_array=True)
hazard_probs = model.predict_cumulative_hazard_function(df_for_alarm, return_array=True)
surv_func = model.predict_survival_function(df_for_alarm, return_array=True)
event_times = [int(x) for x in model.event_times_]
surv_probs = [float(x) for x in surv_func[0]]
df_surv = pd.DataFrame({'SurvProbs': surv_probs[0]}, index=event_times)
index_at_one_year = event_times.index(365)
drop_after_one_year = 1 - float(df_surv.iloc[index_at_one_year][0])
alarm_arguments = generate_alarm_arguments(df, ats_resolution, drop_after_one_year)
return {
'EventTimes': [int(x) for x in model.event_times_],
'SurvivalProbs': [float(x) for x in surv_probs[0]],
'HazardProbs': [float(x) for x in hazard_probs[0]]
'EventTimes': event_times,
'SurvivalProbs': surv_probs,
'AlarmArguments': alarm_arguments
}
@app.post('/predict_training', response_model=TrainingOutputData, tags=["ai"])
......@@ -220,16 +225,13 @@ def predict_training(incoming_data: InputData):
else:
compliance_prob = 0
compliance = 0 if compliance_prob < 0.5 else 1
complete_arguments = generate_arguments(df, ats_resolution, "Complete", float(complete_prob))
fall_arguments = generate_arguments(df, ats_resolution, "Fall", float(fall_prob))
complete_arguments = generate_complete_arguments(df, ats_resolution, complete_prob)
return {
'CompleteProb': float(complete_prob),
'FallProb': float(fall_prob),
'Compliance': int(compliance),
'CompleteArguments': complete_arguments,
'FallArguments': fall_arguments,
'CompleteArguments': complete_arguments
}
def validate_data(incoming_data: InputData):
......@@ -267,9 +269,8 @@ def add_label_encoding(df: pd.DataFrame, encoders, ats_resolution: int) -> pd.Da
df.loc[:, col_name] = le.transform(df.loc[:, col_name].astype(str))
return df
def generate_arguments(df: pd.DataFrame, ats_resolution: int, case: str, prob: float):
def generate_complete_arguments(df: pd.DataFrame, ats_resolution: int, prob: float):
arguments = list()
gender_argument = "Kvinder" if int(df.iloc[0].Gender) == 0 else "Mænd"
arguments.append(gender_argument)
......@@ -293,9 +294,37 @@ def generate_arguments(df: pd.DataFrame, ats_resolution: int, case: str, prob: f
loan_period_argument = f"og en gennemsnitlig låneperiode på {loan_period} dage"
arguments.append(loan_period_argument)
arguments.append("gennemfører" if case == "Complete" else "falder")
arguments.append("gennemfører")
arguments.append(f"med {int(round(prob*100, 0))}% sandsynlighed")
return arguments
def generate_alarm_arguments(df: pd.DataFrame, ats_resolution: int, pct_drop: float):
    """Build the list of Danish argument strings explaining an alarm prediction.

    :param df: single-row DataFrame with citizen features (``Gender``,
               ``BirthYear``, ``NumberAts``, ``{i}Ats`` columns, ``LoanPeriod``).
    :param ats_resolution: number of assistive-device (ATS) slots to describe.
    :param pct_drop: probability (0-1) of needing an emergency alarm within one year.
    :return: list of argument strings (Danish, user-facing).
    """
    arguments = list()
    gender_argument = "Kvinder" if int(df.iloc[0].Gender) == 0 else "Mænd"
    arguments.append(gender_argument)
    # NOTE(review): 121 looks like a hard-coded reference-year offset for a
    # two-digit birth year encoding — confirm against the data pipeline.
    age_argument = f"på {121 - int(df.iloc[0].BirthYear)} år"
    arguments.append(age_argument)
    number_ats = int(df.iloc[0].NumberAts)
    # Fix: number_ats is already an int; removed the redundant second int() cast.
    if number_ats < 1:
        arguments.append("uden hjælpemidler i eget hjem")
    else:
        arguments.append("med følgende hjælpemidler i eget hjem:")
        for i in range(1, ats_resolution+1):
            ats_name = get_ats_name_from_hmi(df.iloc[0][f'{i}Ats'])
            if ats_name != "":
                arguments.append(f'Et {i}. hjælpemiddel af typen {ats_name}')
            else:
                arguments.append(f'Uden et {i}. hjælpemiddel.')
    loan_period = int(df.iloc[0].LoanPeriod)
    loan_period_argument = f"og en gennemsnitlig låneperiode på {loan_period} dage"
    arguments.append(loan_period_argument)
    arguments.append("får efter et år en nødalarm")
    arguments.append(f"med {int(round(pct_drop*100, 0))}% sandsynlighed")
    return arguments
def load_settings(file_name):
......
......@@ -4,7 +4,7 @@ load_and_clean_data.py
Script to load the raw data and then clean it.
"""
from tools import file_writer, raw_loader, cleaner
from tools import raw_loader, cleaner
from utility.data import write_pickle
import paths as pt
......@@ -45,4 +45,4 @@ def main():
write_pickle(ic, pt.INTERIM_DATA_DIR, 'ic.pkl')
if __name__ == "__main__":
main()
\ No newline at end of file
main()
......@@ -136,4 +136,4 @@ def main():
write_csv(df_alarm, file_path, file_name)
if __name__ == "__main__":
main()
\ No newline at end of file
main()
#!/usr/bin/env python
import paths as pt
from pathlib import Path
from tools import file_writer, data_loader
from sklearn.preprocessing import LabelEncoder
from tools import data_loader
from utility.settings import load_settings
from sksurv.ensemble import RandomSurvivalForest
from io import BytesIO
import shutil
import pandas as pd
import numpy as np
def main():
    """Train a RandomSurvivalForest for the alarm case and persist artifacts.

    Loads processed alarm data, label-encodes the ATS columns, fits the model
    on the first 1000 rows, writes the encoders and model to MODELS_DIR, and
    prints the one-year survival-probability drop for one sample citizen.
    """
    data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
    target_settings = load_settings(pt.CONFIGS_DIR, "alarm.yaml")
    # Number of assistive-device (ATS) feature slots, i.e. columns 1Ats..NAts.
    ats_resolution = data_settings['ats_resolution']
    # NOTE(review): both a .pkl and a .csv filename are passed here — confirm
    # the loader's expected signature; this may be leftover from an edit.
    dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR,
                                     "alarm_data.pkl",
                                     "alarm_emb.csv",
                                     target_settings).load_data()
    X, y = dl.get_data()
    # Fit one LabelEncoder per ATS column; keep them so serving code can
    # apply the identical encoding (written out as alarm_labels.pkl below).
    labels_enc = dict()
    ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
    for col_name in ats_cols:
        le = LabelEncoder()
        le.fit(X.loc[:, col_name].astype(str))
        labels_enc[col_name] = le
        X.loc[:, col_name] = le.transform(X.loc[:, col_name].astype(str))
    # NOTE(review): training is capped at the first 1000 rows — presumably to
    # keep this script fast; confirm this is intentional for the saved model.
    X = X[:1000]
    y = y[:1000]
    model = RandomSurvivalForest(n_estimators=200, max_depth=3,
                                 n_jobs=-1, random_state=0)
    model.fit(X, y)
    # Survival curve for one arbitrary sample, evaluated at the model's
    # observed event times.
    sample = X.iloc[10]
    surv_probs = model.predict_survival_function([sample], return_array=True)
    event_times = [int(x) for x in model.event_times_]
    # Serialize via an in-memory buffer, then copy the bytes to disk.
    with open(Path.joinpath(pt.MODELS_DIR, "alarm_labels.pkl"), 'wb') as fd:
        outfile = BytesIO()
        file_writer.write_pickle(labels_enc, outfile)
        outfile.seek(0)
        shutil.copyfileobj(outfile, fd)
    with open(Path.joinpath(pt.MODELS_DIR, "alarm_rsf.joblib"), 'wb') as fd:
        outfile = BytesIO()
        file_writer.write_joblib(model, outfile)
        outfile.seek(0)
        shutil.copyfileobj(outfile, fd)
    # Probability mass lost by day 365 == estimated risk of the event
    # occurring within one year for the sampled citizen.
    df = pd.DataFrame({'SurvProbs': surv_probs[0]}, index=event_times)
    index_at_year = event_times.index(365)
    drop_after_one_year = 1 - float(df.iloc[index_at_year][0])
    print(int(round(drop_after_one_year*100, 0)))

if __name__ == '__main__':
    main()
\ No newline at end of file
......@@ -58,4 +58,4 @@ def main():
shutil.copyfileobj(outfile, fd)
if __name__ == '__main__':
main()
\ No newline at end of file
main()
......@@ -69,8 +69,10 @@ class RawLoader2021(BaseRawLoader2021):
df_aa['Seq'] = df_aa.groupby(['ID', 'Kategori ISO nummer']).cumcount()
df_aa = df_aa[['ID', 'Birth Year', 'Gender', 'Kategori ISO nummer', 'Kørselsdato', 'Seq']]
df_aa['LendDate'] = df_aa.apply(lambda x: x['Kørselsdato'] if x['Seq'] % 2 == 0 else pd.NaT, axis=1)
df_aa['ReturnDate'] = df_aa.apply(lambda x: x['Kørselsdato'] if x['Seq'] % 2 == 1 else pd.NaT, axis=1)
df_aa['LendDate'] = df_aa.apply(lambda x: x['Kørselsdato']
if x['Seq'] % 2 == 0 else pd.NaT, axis=1)
df_aa['ReturnDate'] = df_aa.apply(lambda x: x['Kørselsdato']
if x['Seq'] % 2 == 1 else pd.NaT, axis=1)
df_aa['ReturnDate'] = df_aa.groupby(['ID', 'Kategori ISO nummer'])['ReturnDate'].shift(-1)
df_aa = df_aa.dropna(subset=['LendDate', 'ReturnDate'], thresh=1)
......@@ -349,4 +351,4 @@ class RawLoader2021(BaseRawLoader2021):
df['Gender'] = pd.Series.astype(df['Gender'], dtype=str)
df['BirthYear'] = pd.Series.astype(df['BirthYear'], dtype=int)
return df
\ No newline at end of file
return df
......@@ -20,4 +20,4 @@ def main():
tune_compliance_xgb_wb.main()
if __name__ == "__main__":
main()
\ No newline at end of file
main()
......@@ -4,15 +4,15 @@ tune_alarm_boost_wb.py
Grad. boost tune script for Alarm case on WanDB
"""
from utility.settings import load_settings
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.metrics import (concordance_index_censored,
concordance_index_ipcw,
integrated_brier_score)
from sklearn.model_selection import KFold
from utility.settings import load_settings
import numpy as np
from tools import data_loader
import paths as pt
import numpy as np
import os
os.environ["WANDB_SILENT"] = "true"
......@@ -128,4 +128,4 @@ def train_model():
wandb.log({"brier_score": brier_score_mean})
if __name__ == "__main__":
main()
\ No newline at end of file
main()
......@@ -66,7 +66,6 @@ def train_model():
config = wandb.config
# Load data
data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
target_settings = load_settings(pt.CONFIGS_DIR, "alarm.yaml")
dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR,
"alarm_emb.csv",
......@@ -115,4 +114,4 @@ def train_model():
wandb.log({"brier_score": brier_score_mean})
if __name__ == "__main__":
main()
\ No newline at end of file
main()
......@@ -98,9 +98,9 @@ def train_model():
res_validate = cross_validate(model, X, y, cv=skf, scoring=metrics)
# Evaluate performance
accuracy = res_validate[f'test_accuracy']
avg_prec = res_validate[f'test_average_precision']
f1 = res_validate[f'test_f1']
accuracy = res_validate['test_accuracy']
avg_prec = res_validate['test_average_precision']
f1 = res_validate['test_f1']
# Log to wandb
wandb.log({"accuracy": accuracy})
......@@ -108,4 +108,4 @@ def train_model():
wandb.log({"f1": f1})
if __name__ == "__main__":
main()
\ No newline at end of file
main()
......@@ -122,9 +122,9 @@ def train_model():
res_validate = cross_validate(model, X, y, cv=skf, scoring=metrics)
# Evaluate performance
accuracy = res_validate[f'test_accuracy']
avg_prec = res_validate[f'test_average_precision']
f1 = res_validate[f'test_f1']
accuracy = res_validate['test_accuracy']
avg_prec = res_validate['test_average_precision']
f1 = res_validate['test_f1']
# Log to wandb
wandb.log({"accuracy": accuracy})
......@@ -132,4 +132,4 @@ def train_model():
wandb.log({"f1": f1})
if __name__ == "__main__":
main()
\ No newline at end of file
main()
......@@ -98,9 +98,9 @@ def train_model():
res_validate = cross_validate(model, X, y, cv=skf, scoring=metrics)
# Evaluate performance
accuracy = res_validate[f'test_accuracy']
avg_prec = res_validate[f'test_average_precision']
f1 = res_validate[f'test_f1']
accuracy = res_validate['test_accuracy']
avg_prec = res_validate['test_average_precision']
f1 = res_validate['test_f1']
# Log to wandb
wandb.log({"accuracy": accuracy})
......@@ -108,4 +108,4 @@ def train_model():
wandb.log({"f1": f1})
if __name__ == "__main__":
main()
\ No newline at end of file
main()
......@@ -122,9 +122,9 @@ def train_model():
res_validate = cross_validate(model, X, y, cv=skf, scoring=metrics)
# Evaluate performance
accuracy = res_validate[f'test_accuracy']
avg_prec = res_validate[f'test_average_precision']
f1 = res_validate[f'test_f1']
accuracy = res_validate['test_accuracy']
avg_prec = res_validate['test_average_precision']
f1 = res_validate['test_f1']
# Log to wandb
wandb.log({"accuracy": accuracy})
......@@ -132,4 +132,4 @@ def train_model():
wandb.log({"f1": f1})
if __name__ == "__main__":
main()
\ No newline at end of file
main()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment