Commit 5920ed0a authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

updated the way we make arguments

parent 57d15185
Pipeline #86885 passed with stage
in 3 minutes and 20 seconds
This diff is collapsed.
......@@ -44,15 +44,15 @@ def get_df_w_metrics(df, protected_col_name, y_target_name, y_pred_name):
def main():
# Load settings
with open(Path.joinpath(pt.CONFIGS_DIR, "compliance_emb.yaml"), 'r') as stream:
with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
protected_col_name = "Gender_Male"
y_col_name="Compliance"
y_col_name="Fall"
# Load the data
file_name = "compliance_emb.csv"
dl = data_loader.ComplianceDataLoader(file_name, settings).load_data()
file_name = "fall_emb.csv"
dl = data_loader.FallDataLoader(file_name, settings).load_data()
X, y = dl.get_data()
X = X.drop(['Gender_Female'], axis=1)
......@@ -67,7 +67,7 @@ def main():
"use_label_encoder": False,
"learning_rate": 0.1,
"eval_metric": "logloss",
"seed": 0
"random_state": 0
}
model = xgb.XGBClassifier(**params)
......
......@@ -90,11 +90,8 @@ class OutputData(pydantic.BaseModel):
CompleteProb: float
FallProb: float
Compliance: int
CompleteShapValues: dict
FallShapValues: dict
ClusterId: int
ClusterValues: List
Arguments: List
CompleteArguments: list
FallArguments: list
@app.get('/')
def index():
......@@ -186,7 +183,6 @@ def predict(incoming_data: InputData):
data['Ats'] = '0'
df = prepare_data(data, ats_resolution)
arguments = generate_arguments(df, ats_resolution)
complete_model = read_joblib(f'complete_xgboost.joblib')
compliance_model = read_joblib(f'compliance_xgboost.joblib')
......@@ -202,24 +198,24 @@ def predict(incoming_data: InputData):
compliance_prob = compliance_model.predict_proba(df_for_compliance).flatten()[1]
else:
compliance_prob = 0
cluster_id = int(df.iloc[0]['Cluster'])
cluster_converter = {str(i):str for i in range(1, 20)}
clusters = read_dataframe('clusters.csv', converters=cluster_converter)
cluster_values = list(clusters.iloc[:, cluster_id])
complete_shap_values = get_shap_values(complete_model, X_test=df_for_complete)
fall_shap_values = get_shap_values(fall_model, X_test=df_for_fall)
compliance = 0 if compliance_prob < 0.5 else 1
#cluster_id = int(df.iloc[0]['Cluster'])
#cluster_converter = {str(i):str for i in range(1, 20)}
#clusters = read_dataframe('clusters.csv', converters=cluster_converter)
#cluster_values = list(clusters.iloc[:, cluster_id])
#complete_shap_values = get_shap_values(complete_model, X_test=df_for_complete)
#fall_shap_values = get_shap_values(fall_model, X_test=df_for_fall)
complete_arguments = generate_arguments(df, ats_resolution, "Complete", float(complete_prob))
fall_arguments = generate_arguments(df, ats_resolution, "Fall", float(fall_prob))
return {
'CompleteProb': float(complete_prob),
'FallProb': float(fall_prob),
'Compliance': int(compliance),
'CompleteShapValues': complete_shap_values,
'FallShapValues': fall_shap_values,
'ClusterId': int(df.iloc[0]['Cluster']),
'ClusterValues': cluster_values,
'Arguments': arguments
'CompleteArguments': complete_arguments,
'FallArguments': fall_arguments,
}
def add_embeddings(df: pd.DataFrame, case: str, ats_resolution: int) -> pd.DataFrame:
......@@ -230,23 +226,34 @@ def add_embeddings(df: pd.DataFrame, case: str, ats_resolution: int) -> pd.DataF
df[column] = pd.to_numeric(df[column])
return df
def generate_arguments(df: pd.DataFrame, ats_resolution: int, case: str, prob: float) -> list:
    """Build a Danish-language argument sentence, piece by piece, for a prediction.

    The returned fragments, read in order, form a sentence like:
    "Mænd / på 100 år / med følgende hjælpemidler i eget hjem: ... /
    og en gennemsnitlig låneperiode på 30 dage / gennemfører / med 75% sandsynlighed".

    :param df: single-row frame; reads columns Gender_Male, BirthYear,
        NumberAts, LoanPeriod and '1Ats'..'{ats_resolution}Ats'
        -- TODO confirm the expected schema against the caller (predict()).
    :param ats_resolution: number of assistive-technology (ATS) slots to describe.
    :param case: "Complete" selects the wording "gennemfører" (completes);
        any other value (e.g. "Fall") selects "falder" (falls).
    :param prob: predicted probability in [0, 1]; rendered as a whole percent.
    :return: list of sentence fragments (str).
    """
    arguments = list()
    person = df.iloc[0]
    # Gender and age fragments. BirthYear is stored as an offset; 121 - BirthYear
    # reproduces the age computation used elsewhere in this file.
    arguments.append("Mænd" if int(person.Gender_Male) == 1 else "Kvinder")
    arguments.append(f"på {121 - int(person.BirthYear)} år")
    number_ats = int(person.NumberAts)
    if number_ats < 1:
        arguments.append("uden hjælpemidler i eget hjem")
    else:
        # NOTE(review): per the diff order the per-slot listing only runs when the
        # person has at least one aid -- confirm the loop belongs under this branch.
        arguments.append("med følgende hjælpemidler i eget hjem:")
        for i in range(1, ats_resolution + 1):
            ats_name = get_ats_name_from_hmi(person[f'{i}Ats'])
            if ats_name != "":
                arguments.append(f'Et {i}. hjælpemiddel af typen {ats_name}')
            else:
                arguments.append(f'Uden et {i}. hjælpemiddel.')
    arguments.append(f"og en gennemsnitlig låneperiode på {int(person.LoanPeriod)} dage")
    # Verb depends on which model the probability came from.
    arguments.append("gennemfører" if case == "Complete" else "falder")
    arguments.append(f"med {int(round(prob * 100, 0))}% sandsynlighed")
    return arguments
def load_settings(file_name):
......
......@@ -2,92 +2,64 @@
import numpy as np
import pandas as pd
import paths as pt
from tools import file_reader, file_writer, explainer
from tools import file_reader, file_writer, data_loader
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb
from pathlib import Path
import yaml
CASES = ["Complete", "Compliance", "Fall", "Risk"]
DATASET_VERSION = 'emb'
def main(dataset_version : str = 'emb'):
def main():
for case in CASES:
if case == "Complete":
if dataset_version == 'ohe':
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_count.csv')
else:
df = file_reader.read_csv(pt.TESTS_FILES_DIR, 'complete_emb.csv')
target_name = "Complete"
with open(Path.joinpath(pt.CONFIGS_DIR, "complete_emb.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
file_name = f'complete_{DATASET_VERSION}.csv'
dl = data_loader.CompleteDataLoader(file_name, settings).load_data()
X, y = dl.prepare_data()
elif case == "Compliance":
if dataset_version == 'ohe':
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_count.csv')
else:
df = file_reader.read_csv(pt.TESTS_FILES_DIR, 'compliance_emb.csv')
target_name = "Compliance"
with open(Path.joinpath(pt.CONFIGS_DIR, "compliance_emb.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
file_name = f'compliance_{DATASET_VERSION}.csv'
dl = data_loader.ComplianceDataLoader(file_name, settings).load_data()
X, y = dl.prepare_data()
elif case == "Fall":
if dataset_version == 'ohe':
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_count.csv')
else:
df = file_reader.read_csv(pt.TESTS_FILES_DIR, 'fall_emb.csv')
target_name = "Fall"
with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
file_name = f'fall_{DATASET_VERSION}.csv'
dl = data_loader.FallDataLoader(file_name, settings).load_data()
X, y = dl.prepare_data()
else:
if dataset_version == 'ohe':
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'risk_count.csv')
else:
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'risk_emb.csv')
target_name = "Risk"
with open(Path.joinpath(pt.CONFIGS_DIR, "risk_emb.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
file_name = f'risk_{DATASET_VERSION}.csv'
dl = data_loader.RiskDataLoader(file_name, settings).load_data()
X, y = dl.prepare_data()
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
X = df.drop([target_name, 'Rand'], axis=1)
y = df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
stratify=y, random_state=0)
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos
params = {"n_estimators": 400,
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.07,
"learning_rate": 0.1,
"eval_metric": "logloss",
"random_state": 0
}
model = xgb.XGBClassifier(**params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
y_valid_pred = 0*y
valid_acc, valid_pre, valid_recall, valid_roc_auc = list(), list(), list(), list()
for train_index, valid_index in skf.split(X_train, y_train):
X_train_split, X_valid_split = X_train.iloc[train_index,:], X_train.iloc[valid_index,:]
y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]
model.fit(X_train, y_train)
optimize_rounds = True
early_stopping_rounds = 200
if optimize_rounds:
eval_set=[(X_valid_split, y_valid_split)]
fit_model = model.fit(X_train_split, y_train_split,
eval_set=eval_set,
eval_metric=metrics.gini_xgb,
early_stopping_rounds=early_stopping_rounds,
verbose=False)
else:
fit_model = model.fit(X_train_split, y_train_split)
pred = fit_model.predict_proba(X_valid_split)[:,1]
y_valid_pred.iloc[valid_index] = pred
y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)
valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
valid_pre.append(precision_score(y_valid_split, y_valid_scores))
valid_recall.append(recall_score(y_valid_split, y_valid_scores))
valid_roc_auc.append(roc_auc_score(y_valid_split, y_valid_pred.iloc[valid_index]))
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]
file_writer.write_joblib(model, pt.MODELS_DIR, f'{case.lower()}_xgboost.joblib')
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment