Commit 4350076b authored by thecml
Browse files

updated scripts and notebooks

parent 94e72200
Pipeline #99819 failed with stage
in 1 minute and 2 seconds
This diff is collapsed.
This diff is collapsed.
%% Cell type:code id: tags:
```
import yaml
from pathlib import Path
import pandas as pd
import numpy as np
import paths as pt
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from utility import metrics
from sklearn.metrics import confusion_matrix
from tools import data_loader, file_writer
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
# --- Configuration -----------------------------------------------------------
# Experiment settings are read from the YAML file in the configs directory.
config_path = Path(pt.CONFIGS_DIR) / "complete_emb.yaml"
with open(config_path, 'r') as stream:
    settings = yaml.safe_load(stream)

# Column used to slice the fairness metrics, and the name of the target column.
protected_col_name = 'Gender'
y_col_name = "Complete"

# --- Data --------------------------------------------------------------------
file_name = "complete_emb.csv"
dl = data_loader.CompleteDataLoader(file_name, settings).load_data()
X, y = dl.get_data()

# Hold out 20 % of the rows; stratify so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# --- Model -------------------------------------------------------------------
# Weight the positive class by the negative/positive count ratio of `y`.
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos

params = dict(
    n_estimators=400,
    objective="binary:logistic",
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    learning_rate=0.1,
    eval_metric="logloss",
    seed=0,
)
model = xgb.XGBClassifier(**params)

# Five shuffled, stratified folds with a fixed seed for the cross-validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Empty frame that collects the per-fold validation rows together with the
# true label, the hard prediction and the predicted probability.
df_test = pd.DataFrame(
    [],
    columns=list(X.columns) + ["Complete"] + ["output"] + ["output_prob"],
)
```
%% Cell type:code id: tags:
```
def get_df_w_metrics(df, protected_col_name, y_target_name, y_pred_name):
    """Compute binary confusion-matrix metrics per value of a protected column.

    For every distinct value found in ``df[protected_col_name]`` the rows with
    that value are selected, a 2x2 confusion matrix (labels ``0`` and ``1``) is
    built from ``y_target_name`` versus ``y_pred_name``, and the derived rates
    are stored as one row of the result.

    Args:
        df: Frame holding the protected attribute, the true labels and the
            predicted labels.
        protected_col_name: Name of the column to group by.
        y_target_name: Name of the column with the true 0/1 labels.
        y_pred_name: Name of the column with the predicted 0/1 labels.

    Returns:
        A DataFrame with one row per group and the columns
        ``protected_col_name``, FPR, FNR, TPR, TNR, PPV, NPV, FDR, ACC, F1,
        LRplus, LRminus, TN, FP, FN, TP (in that order).

    Note:
        The counts come back from ``confusion_matrix`` as NumPy integers, so a
        zero denominator (e.g. a group without positives) yields ``nan``/``inf``
        with a RuntimeWarning instead of raising ``ZeroDivisionError``.
    """
    # Same column order the original append-based implementation produced.
    columns = [protected_col_name, "FPR", "FNR", "TPR", "TNR", "PPV", "NPV",
               "FDR", "ACC", "F1", "LRplus", "LRminus", "TN", "FP", "FN", "TP"]

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    rows = []
    for name in df[protected_col_name].unique():
        # Select the group once instead of re-filtering for each column.
        group = df[df[protected_col_name] == name]
        y_true = list(group[y_target_name])
        y_pred = list(group[y_pred_name])

        # labels=[0, 1] guarantees a 2x2 matrix even if a class is absent.
        TN, FP, FN, TP = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        TPR = TP / (TP + FN)
        TNR = TN / (TN + FP)
        PPV = TP / (TP + FP)
        NPV = TN / (TN + FN)
        FPR = FP / (FP + TN)
        FNR = FN / (TP + FN)
        FDR = FP / (TP + FP)
        ACC = (TP + TN) / (TP + FP + FN + TN)
        LRplus = TPR / FPR
        LRminus = FNR / TNR
        F1 = 2 * (PPV * TPR) / (PPV + TPR)

        rows.append({protected_col_name: name, "TPR": TPR, "TNR": TNR,
                     "FPR": FPR, "FNR": FNR, "PPV": PPV, "NPV": NPV,
                     "FDR": FDR, "ACC": ACC, "F1": F1, "LRplus": LRplus,
                     "LRminus": LRminus, "TN": TN, "FP": FP, "FN": FN,
                     "TP": TP})

    return pd.DataFrame(rows, columns=columns)
```
%% Cell type:code id: tags:
```
# Early-stopping configuration is the same for every fold, so set it once.
optimize_rounds = True
early_stopping_rounds = 50

# Out-of-fold probabilities. The fold indices returned by skf.split() are
# positions within X_train / y_train, so the buffer is aligned to y_train and
# created as float so the predicted probabilities can be stored without an
# integer-to-float upcast.
y_valid_pred = pd.Series(0.0, index=y_train.index, name=y_train.name)
valid_acc, valid_pre, valid_recall, valid_roc_auc = [], [], [], []

# Per-fold validation frames; concatenated onto df_test once after the loop
# (DataFrame.append was removed in pandas 2.0).
fold_frames = []

for i, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_train_split = X_train.iloc[train_index, :]
    X_valid_split = X_train.iloc[valid_index, :]
    y_train_split = y_train.iloc[train_index]
    y_valid_split = y_train.iloc[valid_index]

    if optimize_rounds:
        # NOTE(review): newer xgboost releases expect eval_metric and
        # early_stopping_rounds on the estimator rather than in fit();
        # confirm against the pinned xgboost version.
        eval_set = [(X_valid_split, y_valid_split)]
        fit_model = model.fit(X_train_split, y_train_split,
                              eval_set=eval_set,
                              eval_metric=metrics.gini_xgb,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose=False)
    else:
        fit_model = model.fit(X_train_split, y_train_split)

    # Probability of the positive class, thresholded at 0.5 for hard labels.
    pred = fit_model.predict_proba(X_valid_split)[:, 1]
    y_valid_pred.iloc[valid_index] = pred
    y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)

    # Save data: features, true label, hard prediction and probability.
    y_true_pd = y_valid_split.to_frame().reset_index(drop=True)
    # Name the column explicitly instead of relying on the label Series
    # happening to be called "Complete".
    y_pred_pd = (y_valid_scores.astype(int)
                 .rename("output")
                 .to_frame()
                 .reset_index(drop=True))
    y_pred_prob_pd = pd.DataFrame(pred, columns=["output_prob"])
    df_subset = pd.concat([X_valid_split.reset_index(drop=True),
                           y_true_pd, y_pred_pd, y_pred_prob_pd], axis=1)
    fold_frames.append(df_subset)

    # Save metrics
    df_evaluate_proc = get_df_w_metrics(df_subset, protected_col_name,
                                        y_col_name, "output")
    file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR,
                          "model" + str(i) + "_" + protected_col_name + ".csv")

    # NOTE(review): this evaluates a copy of the same fold frame, so the
    # "_all" file has the same content as the file above — confirm whether
    # a different data set was intended here.
    df_evaluate_together = df_subset.copy()
    df_evaluate_all = get_df_w_metrics(df_evaluate_together, protected_col_name,
                                       y_col_name, "output")
    file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR,
                          "model" + str(i) + "_" + protected_col_name + "_all.csv")

    valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
    valid_pre.append(precision_score(y_valid_split, y_valid_scores))
    valid_recall.append(recall_score(y_valid_split, y_valid_scores))
    valid_roc_auc.append(roc_auc_score(y_valid_split,
                                       y_valid_pred.iloc[valid_index]))

# Add the validation rows of all folds to df_test in a single concat.
df_test = pd.concat([df_test, *fold_frames], ignore_index=True)
```
%% Cell type:code id: tags:
```
# Write the accumulated df_test frame to "all_test_data.csv" in the interim
# data directory.
file_writer.write_csv(df_test, pt.INTERIM_DATA_DIR, "all_test_data.csv")
```
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
# Source workbook with one row per transport event of an assistive device.
file = 'C:\\Users\\cml\\Downloads\\AIR export\\Aalborg\\Hjælpemidler minus 50_Rasmus_Details-CPR.xlsx'

# Read ID and ISO category as strings so they are not parsed as numbers.
df = pd.read_excel(file, engine='openpyxl', converters={'ID': str, 'Kategori ISO nummer': str})

# Running event number within each (citizen, device category) pair.
# NOTE(review): this relies on the row order of the workbook; confirm that
# events are ordered chronologically within each group.
df['Seq'] = df.groupby(['ID', 'Kategori ISO nummer']).cumcount()

# Explicit copy so the column assignments below act on an independent frame
# and do not trigger SettingWithCopyWarning.
df = df[['ID', 'Birth Year', 'Gender', 'Kategori ISO nummer', 'Kørselsdato', 'Seq']].copy()

# Even-numbered events are treated as lends, odd-numbered events as returns.
# Vectorised replacement for two row-wise apply(axis=1) calls.
is_lend = df['Seq'] % 2 == 0
df['LendDate'] = df['Kørselsdato'].where(is_lend, pd.NaT)
df['ReturnDate'] = df['Kørselsdato'].where(~is_lend, pd.NaT)

# Pull each return date up one row within its group so it sits next to the
# lend event that precedes it.
df['ReturnDate'] = df.groupby(['ID', 'Kategori ISO nummer'])['ReturnDate'].shift(-1)

# Drop rows where both dates are missing (equivalent to thresh=1 on two columns).
df = df.dropna(subset=['LendDate', 'ReturnDate'], how='all')

# 'Kørselsdato' and 'Seq' are kept for inspection; they can be dropped once
# the pairing has been verified.
```
%% Cell type:code id: tags:
``` python
# Bare expression: the notebook renders the frame as the cell output.
df
```
%% Output
ID Birth Year Gender Kategori ISO nummer Kørselsdato Seq \
0 2429541786 23 FEMALE 22271812 06/08/19 0
1 2429541786 23 FEMALE 12060611 19/02/18 0
2 2429541786 23 FEMALE 12072401 19/02/18 0
3 2430269034 26 FEMALE 22271812 09/03/20 0
5 2430269034 26 FEMALE 12362124 14/10/19 0
... ... ... ... ... ... ...
72044 74711770410 99 FEMALE 18301509 13/06/19 0
72046 74711770410 99 FEMALE 12220308 28/09/21 0
72048 74711770410 99 FEMALE 12220308 25/08/16 2
72050 74711770410 99 FEMALE 04330301 07/09/21 0
72051 74711770410 99 FEMALE 99999999 28/09/21 0
LendDate ReturnDate
0 06/08/19 NaN
1 19/02/18 NaN
2 19/02/18 NaN
3 09/03/20 09/11/20
5 14/10/19 29/07/21
... ... ...
72044 13/06/19 07/09/21
72046 28/09/21 07/09/21
72048 25/08/16 29/09/21
72050 07/09/21 28/09/21
72051 28/09/21 NaN
[48292 rows x 8 columns]
This diff is collapsed.
This diff is collapsed.