Commit b6406ffa authored by Christian Marius Lillelund

improved bias script, adjusted settings

parent 5920ed0a
Pipeline #87251 passed with stage in 3 minutes
@@ -12,7 +12,7 @@ threshold_training: 10
train_ratio: 0.8
batch_size: 32
num_epochs: 5
-verbose: True
+verbose: False
network_layers: [128]
optimizer: "Adam"
......
@@ -12,7 +12,7 @@ threshold_training: 10
train_ratio: 0.8
batch_size: 32
num_epochs: 10
-verbose: True
+verbose: False
network_layers: [128]
optimizer: "Adam"
......
@@ -12,7 +12,7 @@ threshold_training: 10
train_ratio: 0.8
batch_size: 32
num_epochs: 10
-verbose: True
+verbose: False
network_layers: [128]
optimizer: "Adam"
......
@@ -12,7 +12,7 @@ train_ratio: 0.8
batch_size: 32
num_epochs_ats: 10
num_epochs_ex: 5
-verbose: True
+verbose: False
network_layers: [128]
optimizer: "Adam"
......
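All four config hunks above flip verbose from True to False. For reference, a minimal sketch of loading one of these YAML settings files and consulting the flag; the configs directory and the helper name are hypothetical, the keys mirror the snippets above, and PyYAML is assumed (the bias script below opens fall_emb.yaml in the same style).

```python
# Minimal sketch (assumptions: PyYAML installed, a local "configs" directory,
# hypothetical helper name). Keys mirror the config snippets in this commit.
from pathlib import Path
import yaml

def load_settings(configs_dir: Path, file_name: str = "fall_emb.yaml") -> dict:
    with open(Path.joinpath(configs_dir, file_name), "r") as stream:
        return yaml.safe_load(stream)

settings = load_settings(Path("configs"))
if settings.get("verbose", False):
    print(f"epochs={settings['num_epochs']}, batch size={settings['batch_size']}")
```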
@@ -8,40 +8,11 @@ import xgboost as xgb
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from utility import metrics
-from sklearn.metrics import confusion_matrix
from tools import data_loader, file_writer, file_reader
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
import matplotlib.pyplot as plt
-def get_df_w_metrics(df, protected_col_name, y_target_name, y_pred_name):
-    confusion_df = pd.DataFrame(columns=[protected_col_name, "FPR", "FNR"])
-    for name in list(df[protected_col_name].unique()):
-        a=df[df[protected_col_name]==name][y_target_name]
-        b=df[df[protected_col_name]==name][y_pred_name]
-        TN, FP, FN, TP = confusion_matrix(list(a), list(b),labels=[0, 1]).ravel()
-        TPR = TP/(TP+FN)
-        TNR = TN/(TN+FP)
-        PPV = TP/(TP+FP)
-        NPV = TN/(TN+FN)
-        FPR = FP/(FP+TN)
-        FNR = FN/(TP+FN)
-        FDR = FP/(TP+FP)
-        ACC = (TP+TN)/(TP+FP+FN+TN)
-        LRplus=TPR/FPR
-        LRminus=FNR/TNR
-        F1=2*(PPV*TPR)/(PPV+TPR)
-        confusion_df = confusion_df.append({protected_col_name:name, "TPR":TPR, "TNR":TNR, "FPR":FPR,
-                                            "FNR":FNR, "PPV":PPV, "NPV":NPV, "FDR":FDR, "ACC":ACC,
-                                            "F1":F1, "LRplus":LRplus, "LRminus":LRminus, "TN":TN,
-                                            "FP":FP, "FN":FN, "TP":TP}, ignore_index=True)
-    return confusion_df
def main():
    # Load settings
    with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), 'r') as stream:
@@ -90,7 +61,7 @@ def main():
eval_set=eval_set,
eval_metric=metrics.gini_xgb,
early_stopping_rounds=early_stopping_rounds,
-verbose=False)
+verbose=False)
else:
fit_model = model.fit(X_train_split, y_train_split)
@@ -100,20 +71,26 @@ def main():
# Save data
y_true_pd=y_valid_split.to_frame().reset_index(drop=True)
-y_pred_pd=y_valid_scores.apply(lambda x: 1 if x == True else 0).to_frame().reset_index(drop=True).rename(columns={y_col_name : "output"})
+y_valid_scores = y_valid_scores.apply(lambda x: 1 if x == True else 0).to_frame()
+y_pred_pd=y_valid_scores.reset_index(drop=True).rename(columns={y_col_name : "output"})
y_pred_prob_pd = pd.DataFrame(pred, columns = ["output_prob"])
-df_subset = pd.concat([X_valid_split.reset_index(drop=True), y_true_pd, y_pred_pd, y_pred_prob_pd], axis=1)
+df_subset = pd.concat([X_valid_split.reset_index(drop=True), y_true_pd,
+                       y_pred_pd, y_pred_prob_pd], axis=1)
df_test = df_test.append(df_subset, ignore_index=True)
# Save metrics
-df_evaluate_proc = get_df_w_metrics(df_subset, protected_col_name, y_col_name, "output")
-file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, "model"+str(i) + "_" + protected_col_name + ".csv")
+df_evaluate_proc = metrics.get_cm_by_protected_variable(df_subset, protected_col_name,
+                                                        y_col_name, "output")
+file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, "model"+str(i)
+                      + "_" + protected_col_name + ".csv")
df_evaluate_together = df_subset.copy()
df_evaluate_together[protected_col_name] = "all"
-df_evaluate_all = get_df_w_metrics(df_evaluate_together, protected_col_name, y_col_name, "output")
-file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, "model"+str(i) + "_" + protected_col_name + "_all.csv")
+df_evaluate_all = metrics.get_cm_by_protected_variable(df_evaluate_together, protected_col_name,
+                                                       y_col_name, "output")
+file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, "model"+str(i)
+                      + "_" + protected_col_name + "_all.csv")
valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
valid_pre.append(precision_score(y_valid_split, y_valid_scores))
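The per-fold blocks above produce one prediction frame and one metrics CSV per model index. As a self-contained sketch of the implied 5-fold StratifiedKFold loop (variable names, the classifier settings, and the 0.5 threshold here are illustrative, not the script's actual ones):

```python
# Illustrative 5-fold loop matching the model0..model4 outputs referenced below.
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

def run_folds(X: pd.DataFrame, y: pd.Series, n_splits: int = 5) -> list:
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    fold_frames = []
    for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        model = xgb.XGBClassifier(n_estimators=400, learning_rate=0.1,
                                  objective="binary:logistic", random_state=0)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        prob = model.predict_proba(X.iloc[valid_idx])[:, 1]
        frame = X.iloc[valid_idx].reset_index(drop=True)
        frame["output"] = (prob > 0.5).astype(int)  # binarized prediction
        frame["output_prob"] = prob                 # raw probability
        fold_frames.append(frame)                   # one frame per fold i
    return fold_frames
```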
@@ -134,17 +111,21 @@ def main():
# Save the confusion data for all
column_names = ["Group", "ML", "Measure", "Value"]
+measures = ['FPR', 'FNR', 'ACC', 'F1', 'FDR', 'LRminus', 'LRplus',
+            'NPV', 'PPV', 'TNR', 'TPR','TP','TN','FN', 'FP']
df_out = pd.DataFrame(columns=column_names)
for i in range(5):
    data = file_reader.read_csv(pt.INTERIM_DATA_DIR, f'model{i}_{protected_col_name}_all.csv')
    for group in ["all"]:
-        for measure in ['FPR', 'FNR', 'ACC', 'F1', 'FDR', 'LRminus','LRplus', 'NPV', 'PPV', 'TNR', 'TPR','TP','TN','FN','FP']:
+        for measure in measures:
            value = float(data[data[protected_col_name] == group][measure])
-            df_out=df_out.append({'Group': group, "ML":"XGBoost"+str(i), "Measure":measure, "Value":value}, ignore_index=True)
+            df_out=df_out.append({'Group': group, "ML":"XGBoost"+str(i),
+                                  "Measure":measure, "Value":value}, ignore_index=True)
file_writer.write_csv(df_out, pt.INTERIM_DATA_DIR, 'XGBoost_metrics_crossvalidated_all.csv')
-global_all_bar=sns.barplot(data=df_out[df_out["Measure"].isin(["FPR","FNR","TPR","TNR"])],x="Group", y="Value", ci=95,hue="Measure")
+global_all_bar=sns.barplot(data=df_out[df_out["Measure"].isin(["FPR","FNR","TPR","TNR"])],
+                           x="Group", y="Value", ci=95,hue="Measure")
global_all_bar.set_title('All')
global_all_bar.get_figure().savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, f"{protected_col_name}_barplot_all.pdf"))
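The loop above collects the cross-validated "all" metrics into a long-format Group/ML/Measure/Value frame and plots the error rates with a 95% confidence interval over the five models. A self-contained sketch of that plot on dummy values (illustration only; it uses the same older seaborn ci= API as the script):

```python
# Dummy long-format frame and the corresponding error-rate bar plot.
# Values are made up for illustration, not results from the experiments.
import pandas as pd
import seaborn as sns

df_out = pd.DataFrame({
    "Group":   ["all"] * 8,
    "ML":      [f"XGBoost{i}" for i in range(4) for _ in range(2)],
    "Measure": ["FPR", "FNR"] * 4,
    "Value":   [0.21, 0.34, 0.19, 0.37, 0.22, 0.31, 0.20, 0.35],
})
ax = sns.barplot(data=df_out[df_out["Measure"].isin(["FPR", "FNR", "TPR", "TNR"])],
                 x="Group", y="Value", ci=95, hue="Measure")
ax.set_title("All")
ax.get_figure().savefig("example_barplot_all.pdf")
```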
@@ -155,12 +136,14 @@ def main():
for i in range(5):
    data = file_reader.read_csv(pt.INTERIM_DATA_DIR, f'model{i}_{protected_col_name}.csv')
    for group in [0.0, 1.0]:
-        for measure in ['FPR', 'FNR', 'ACC', 'F1', 'FDR', 'LRminus','LRplus', 'NPV', 'PPV', 'TNR', 'TPR','TP','TN','FN','FP']:
+        for measure in measures:
            value=float(data[data[protected_col_name]==group][measure])
-            df_out=df_out.append({'Group': group,"ML":"XGBoost"+str(i),"Measure":measure,"Value":value}, ignore_index=True)
+            df_out=df_out.append({'Group': group,"ML":"XGBoost"+str(i),
+                                  "Measure":measure,"Value":value}, ignore_index=True)
file_writer.write_csv(df_out, pt.INTERIM_DATA_DIR, f'XGBoost_metrics_crossvalidated_{protected_col_name}.csv')
-global_proc_bar=sns.barplot(data=df_out[df_out["Measure"].isin(["FPR","FNR","TPR","TNR"])],x="Group", y="Value", ci=95,hue="Measure")
+global_proc_bar=sns.barplot(data=df_out[df_out["Measure"].isin(["FPR","FNR","TPR","TNR"])],
+                            x="Group", y="Value", ci=95,hue="Measure")
global_proc_bar.set_title(f'Proctected: {protected_col_name}')
global_all_bar.get_figure().savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, "barplot_proc.pdf"))
@@ -227,7 +210,7 @@ def main():
ax[i].set_ylabel('',fontsize=20)
ax[i].set_xlabel("",fontsize=20)
-plt.savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, "XGBoost Gender Metrics.pdf"), dpi=300, bbox_inches = "tight")
+plt.savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, "XGBoost Gender Metrics.pdf"), dpi=300, bbox_inches="tight")
# Calculate relation between male/female
frame=all_data_gender
@@ -236,8 +219,10 @@ def main():
for i in list(frame["Model"].unique()):
    for j in list(frame["Metric"].unique()) :
        if j not in ["Mean_y_target","Mean_y_hat_prob"]:
-            female_val = frame[(frame["Model"]==i) & (frame["Metric"]==j) & (frame["Gender"]=="Female")]["Value"].mean()
-            male_val = frame[(frame["Model"]==i) & (frame["Metric"]==j) & (frame["Gender"]=="Male")]["Value"].mean()
+            female_val = frame[(frame["Model"]==i) & (frame["Metric"]==j)
+                               & (frame["Gender"]=="Female")]["Value"].mean()
+            male_val = frame[(frame["Model"]==i) & (frame["Metric"]==j)
+                             & (frame["Gender"]=="Male")]["Value"].mean()
            relation=female_val/male_val
            newFrame=newFrame.append({"Model":i,"Metric":j,"Relation":relation},ignore_index=True)
......
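The final hunk of the bias script computes, per model and metric, the ratio of the mean female value to the mean male value, so 1.0 means parity between the groups. A self-contained sketch of that computation on dummy values, mirroring the loop above (column names follow all_data_gender; the Mean_y_* exclusion is omitted here):

```python
# Female/Male relation per model and metric; input values are dummy data.
import pandas as pd

frame = pd.DataFrame({
    "Model":  ["XGBoost0"] * 4,
    "Metric": ["FPR", "FPR", "FNR", "FNR"],
    "Gender": ["Female", "Male", "Female", "Male"],
    "Value":  [0.20, 0.25, 0.40, 0.32],
})

rows = []
for model in frame["Model"].unique():
    for metric in frame["Metric"].unique():
        sub = frame[(frame["Model"] == model) & (frame["Metric"] == metric)]
        female_val = sub[sub["Gender"] == "Female"]["Value"].mean()
        male_val = sub[sub["Gender"] == "Male"]["Value"].mean()
        rows.append({"Model": model, "Metric": metric,
                     "Relation": female_val / male_val})  # 1.0 == parity
new_frame = pd.DataFrame(rows)
```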
@@ -47,15 +47,13 @@ def main():
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos
params = {"n_estimators": 400,
-          "objective": "binary:logistic",
-          "scale_pos_weight": scale_pos_weight,
-          "use_label_encoder": False,
-          "learning_rate": 0.1,
-          "eval_metric": "logloss",
-          "random_state": 0
-          }
+          "learning_rate": 0.1,
+          "scale_pos_weight": scale_pos_weight,
+          "objective": "binary:logistic",
+          "random_state": 0,
+          "use_label_encoder": False,
+          "eval_metric": 'logloss'}
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)
......
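The reordered parameter dict above keeps the class-imbalance handling: scale_pos_weight is the ratio of negative to positive labels. A minimal, self-contained sketch of deriving that weight and building the classifier with the same parameters (the synthetic dataset is illustrative only):

```python
# Derive scale_pos_weight from the label counts and fit an XGBoost classifier
# with the parameters shown above. The dataset is synthetic, for illustration.
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
neg, pos = np.bincount(y)            # counts of negative and positive labels
scale_pos_weight = neg / pos         # upweights the minority (positive) class

params = {"n_estimators": 400,
          "learning_rate": 0.1,
          "scale_pos_weight": scale_pos_weight,
          "objective": "binary:logistic",
          "random_state": 0,
          "use_label_encoder": False,
          "eval_metric": "logloss"}
model = xgb.XGBClassifier(**params)
model.fit(X, y)
```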
@@ -58,7 +58,6 @@ class XgbClassifier(BaseClassifer):
scale_pos_weight = neg / pos
params = {"n_estimators": 400,
          "learning_rate": 0.1,
-          "max_depth": 4,
          "scale_pos_weight": scale_pos_weight,
          "objective": "binary:logistic",
          "random_state": 0,
......
import numpy as np
+import pandas as pd
import xgboost
from typing import List, Tuple
+from sklearn.metrics import confusion_matrix

def compute_mean(values: List):
    return round(np.mean(values)*100, 3)
@@ -26,4 +28,32 @@ def eval_gini(y_true: np.array, y_prob: np.array) -> float:
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
-    return gini
\ No newline at end of file
+    return gini
+
+def get_cm_by_protected_variable(df, protected_col_name, y_target_name, y_pred_name):
+    confusion_df = pd.DataFrame(columns=[protected_col_name, "FPR", "FNR"])
+    for name in list(df[protected_col_name].unique()):
+        a=df[df[protected_col_name]==name][y_target_name]
+        b=df[df[protected_col_name]==name][y_pred_name]
+        TN, FP, FN, TP = confusion_matrix(list(a), list(b),labels=[0, 1]).ravel()
+        TPR = TP/(TP+FN)
+        TNR = TN/(TN+FP)
+        PPV = TP/(TP+FP)
+        NPV = TN/(TN+FN)
+        FPR = FP/(FP+TN)
+        FNR = FN/(TP+FN)
+        FDR = FP/(TP+FP)
+        ACC = (TP+TN)/(TP+FP+FN+TN)
+        LRplus=TPR/FPR
+        LRminus=FNR/TNR
+        F1=2*(PPV*TPR)/(PPV+TPR)
+        confusion_df = confusion_df.append({protected_col_name:name, "TPR":TPR, "TNR":TNR, "FPR":FPR,
+                                            "FNR":FNR, "PPV":PPV, "NPV":NPV, "FDR":FDR, "ACC":ACC,
+                                            "F1":F1, "LRplus":LRplus, "LRminus":LRminus, "TN":TN,
+                                            "FP":FP, "FN":FN, "TP":TP}, ignore_index=True)
+    return confusion_df
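A short usage sketch of the new helper: one row of confusion-matrix-derived rates per value of the protected attribute. The column names and data below are illustrative, the import follows this repo's "from utility import metrics" convention, and the helper relies on DataFrame.append, so it targets pandas versions where that method still exists.

```python
# Toy example: per-group error rates from get_cm_by_protected_variable.
# Column names and values are illustrative only.
import pandas as pd
from utility import metrics

df = pd.DataFrame({
    "Gender": [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],  # protected attribute
    "Fall":   [0,   1,   0,   1,   0,   1,   0],    # y_target_name
    "output": [0,   1,   1,   1,   1,   0,   0],    # y_pred_name
})
cm = metrics.get_cm_by_protected_variable(df, "Gender", "Fall", "output")
print(cm[["Gender", "FPR", "FNR", "TPR", "TNR", "ACC"]])
```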