Commit b6406ffa authored by Christian Marius Lillelund

improved bias script, adjusted settings

parent 5920ed0a
Pipeline #87251 passed
@@ -12,7 +12,7 @@ threshold_training: 10
 train_ratio: 0.8
 batch_size: 32
 num_epochs: 5
-verbose: True
+verbose: False
 network_layers: [128]
 optimizer: "Adam"

@@ -12,7 +12,7 @@ threshold_training: 10
 train_ratio: 0.8
 batch_size: 32
 num_epochs: 10
-verbose: True
+verbose: False
 network_layers: [128]
 optimizer: "Adam"

@@ -12,7 +12,7 @@ threshold_training: 10
 train_ratio: 0.8
 batch_size: 32
 num_epochs: 10
-verbose: True
+verbose: False
 network_layers: [128]
 optimizer: "Adam"

@@ -12,7 +12,7 @@ train_ratio: 0.8
 batch_size: 32
 num_epochs_ats: 10
 num_epochs_ex: 5
-verbose: True
+verbose: False
 network_layers: [128]
 optimizer: "Adam"

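The four configuration hunks above only flip verbose from True to False; the remaining training settings are unchanged. As a minimal, hypothetical sketch of how such a YAML settings file is consumed (the bias script below opens fall_emb.yaml from pt.CONFIGS_DIR; the CONFIGS_DIR stand-in and the exact key names here are assumptions based on the diff):

import yaml
from pathlib import Path

# Stand-in for the repository's pt.CONFIGS_DIR constant (assumption).
CONFIGS_DIR = Path("configs")

with open(CONFIGS_DIR / "fall_emb.yaml", "r") as stream:
    settings = yaml.safe_load(stream)

batch_size = settings["batch_size"]   # 32 in the hunks above
num_epochs = settings["num_epochs"]   # 5 or 10 depending on the config
verbose = settings["verbose"]         # False after this commit
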
@@ -8,40 +8,11 @@ import xgboost as xgb
 import seaborn as sns
 from sklearn.model_selection import StratifiedKFold
 from utility import metrics
-from sklearn.metrics import confusion_matrix
 from tools import data_loader, file_writer, file_reader
 from sklearn.metrics import accuracy_score, precision_score
 from sklearn.metrics import recall_score, roc_auc_score
 import matplotlib.pyplot as plt
-
-def get_df_w_metrics(df, protected_col_name, y_target_name, y_pred_name):
-    confusion_df = pd.DataFrame(columns=[protected_col_name, "FPR", "FNR"])
-    for name in list(df[protected_col_name].unique()):
-        a=df[df[protected_col_name]==name][y_target_name]
-        b=df[df[protected_col_name]==name][y_pred_name]
-        TN, FP, FN, TP = confusion_matrix(list(a), list(b),labels=[0, 1]).ravel()
-        TPR = TP/(TP+FN)
-        TNR = TN/(TN+FP)
-        PPV = TP/(TP+FP)
-        NPV = TN/(TN+FN)
-        FPR = FP/(FP+TN)
-        FNR = FN/(TP+FN)
-        FDR = FP/(TP+FP)
-        ACC = (TP+TN)/(TP+FP+FN+TN)
-        LRplus=TPR/FPR
-        LRminus=FNR/TNR
-        F1=2*(PPV*TPR)/(PPV+TPR)
-        confusion_df = confusion_df.append({protected_col_name:name, "TPR":TPR, "TNR":TNR, "FPR":FPR,
-                                            "FNR":FNR, "PPV":PPV, "NPV":NPV, "FDR":FDR, "ACC":ACC,
-                                            "F1":F1, "LRplus":LRplus, "LRminus":LRminus, "TN":TN,
-                                            "FP":FP, "FN":FN, "TP":TP}, ignore_index=True)
-    return confusion_df

 def main():
     # Load settings
     with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), 'r') as stream:

@@ -90,7 +61,7 @@ def main():
                                   eval_set=eval_set,
                                   eval_metric=metrics.gini_xgb,
                                   early_stopping_rounds=early_stopping_rounds,
-                                  verbose=False)
+                                  verbose=False)
         else:
             fit_model = model.fit(X_train_split, y_train_split)

@@ -100,20 +71,26 @@ def main():
         # Save data
         y_true_pd=y_valid_split.to_frame().reset_index(drop=True)
-        y_pred_pd=y_valid_scores.apply(lambda x: 1 if x == True else 0).to_frame().reset_index(drop=True).rename(columns={y_col_name : "output"})
+        y_valid_scores = y_valid_scores.apply(lambda x: 1 if x == True else 0).to_frame()
+        y_pred_pd=y_valid_scores.reset_index(drop=True).rename(columns={y_col_name : "output"})
         y_pred_prob_pd = pd.DataFrame(pred, columns = ["output_prob"])
-        df_subset = pd.concat([X_valid_split.reset_index(drop=True), y_true_pd, y_pred_pd, y_pred_prob_pd], axis=1)
+        df_subset = pd.concat([X_valid_split.reset_index(drop=True), y_true_pd,
+                               y_pred_pd, y_pred_prob_pd], axis=1)
         df_test = df_test.append(df_subset, ignore_index=True)
         # Save metrics
-        df_evaluate_proc = get_df_w_metrics(df_subset, protected_col_name, y_col_name, "output")
-        file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, "model"+str(i) + "_" + protected_col_name + ".csv")
+        df_evaluate_proc = metrics.get_cm_by_protected_variable(df_subset, protected_col_name,
+                                                                y_col_name, "output")
+        file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, "model"+str(i)
+                              + "_" + protected_col_name + ".csv")
         df_evaluate_together = df_subset.copy()
         df_evaluate_together[protected_col_name] = "all"
-        df_evaluate_all = get_df_w_metrics(df_evaluate_together, protected_col_name, y_col_name, "output")
-        file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, "model"+str(i) + "_" + protected_col_name + "_all.csv")
+        df_evaluate_all = metrics.get_cm_by_protected_variable(df_evaluate_together, protected_col_name,
+                                                               y_col_name, "output")
+        file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, "model"+str(i)
+                              + "_" + protected_col_name + "_all.csv")
         valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
         valid_pre.append(precision_score(y_valid_split, y_valid_scores))

@@ -134,17 +111,21 @@ def main():
     # Save the confusion data for all
     column_names = ["Group", "ML", "Measure", "Value"]
+    measures = ['FPR', 'FNR', 'ACC', 'F1', 'FDR', 'LRminus', 'LRplus',
+                'NPV', 'PPV', 'TNR', 'TPR','TP','TN','FN', 'FP']
     df_out = pd.DataFrame(columns=column_names)
     for i in range(5):
         data = file_reader.read_csv(pt.INTERIM_DATA_DIR, f'model{i}_{protected_col_name}_all.csv')
         for group in ["all"]:
-            for measure in ['FPR', 'FNR', 'ACC', 'F1', 'FDR', 'LRminus','LRplus', 'NPV', 'PPV', 'TNR', 'TPR','TP','TN','FN','FP']:
+            for measure in measures:
                 value = float(data[data[protected_col_name] == group][measure])
-                df_out=df_out.append({'Group': group, "ML":"XGBoost"+str(i), "Measure":measure, "Value":value}, ignore_index=True)
+                df_out=df_out.append({'Group': group, "ML":"XGBoost"+str(i),
+                                      "Measure":measure, "Value":value}, ignore_index=True)
     file_writer.write_csv(df_out, pt.INTERIM_DATA_DIR, 'XGBoost_metrics_crossvalidated_all.csv')
-    global_all_bar=sns.barplot(data=df_out[df_out["Measure"].isin(["FPR","FNR","TPR","TNR"])],x="Group", y="Value", ci=95,hue="Measure")
+    global_all_bar=sns.barplot(data=df_out[df_out["Measure"].isin(["FPR","FNR","TPR","TNR"])],
+                               x="Group", y="Value", ci=95,hue="Measure")
     global_all_bar.set_title('All')
     global_all_bar.get_figure().savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, f"{protected_col_name}_barplot_all.pdf"))

@@ -155,12 +136,14 @@ def main():
     for i in range(5):
         data = file_reader.read_csv(pt.INTERIM_DATA_DIR, f'model{i}_{protected_col_name}.csv')
         for group in [0.0, 1.0]:
-            for measure in ['FPR', 'FNR', 'ACC', 'F1', 'FDR', 'LRminus','LRplus', 'NPV', 'PPV', 'TNR', 'TPR','TP','TN','FN','FP']:
+            for measure in measures:
                 value=float(data[data[protected_col_name]==group][measure])
-                df_out=df_out.append({'Group': group,"ML":"XGBoost"+str(i),"Measure":measure,"Value":value}, ignore_index=True)
+                df_out=df_out.append({'Group': group,"ML":"XGBoost"+str(i),
+                                      "Measure":measure,"Value":value}, ignore_index=True)
     file_writer.write_csv(df_out, pt.INTERIM_DATA_DIR, f'XGBoost_metrics_crossvalidated_{protected_col_name}.csv')
-    global_proc_bar=sns.barplot(data=df_out[df_out["Measure"].isin(["FPR","FNR","TPR","TNR"])],x="Group", y="Value", ci=95,hue="Measure")
+    global_proc_bar=sns.barplot(data=df_out[df_out["Measure"].isin(["FPR","FNR","TPR","TNR"])],
+                                x="Group", y="Value", ci=95,hue="Measure")
     global_proc_bar.set_title(f'Proctected: {protected_col_name}')
     global_all_bar.get_figure().savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, "barplot_proc.pdf"))

@@ -227,7 +210,7 @@ def main():
         ax[i].set_ylabel('',fontsize=20)
         ax[i].set_xlabel("",fontsize=20)
-    plt.savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, "XGBoost Gender Metrics.pdf"), dpi=300, bbox_inches = "tight")
+    plt.savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, "XGBoost Gender Metrics.pdf"), dpi=300, bbox_inches="tight")

     # Calculate relation between male/female
     frame=all_data_gender

@@ -236,8 +219,10 @@ def main():
     for i in list(frame["Model"].unique()):
         for j in list(frame["Metric"].unique()) :
             if j not in ["Mean_y_target","Mean_y_hat_prob"]:
-                female_val = frame[(frame["Model"]==i) & (frame["Metric"]==j) & (frame["Gender"]=="Female")]["Value"].mean()
-                male_val = frame[(frame["Model"]==i) & (frame["Metric"]==j) & (frame["Gender"]=="Male")]["Value"].mean()
+                female_val = frame[(frame["Model"]==i) & (frame["Metric"]==j)
+                                   & (frame["Gender"]=="Female")]["Value"].mean()
+                male_val = frame[(frame["Model"]==i) & (frame["Metric"]==j)
+                                 & (frame["Gender"]=="Male")]["Value"].mean()
                 relation=female_val/male_val
                 newFrame=newFrame.append({"Model":i,"Metric":j,"Relation":relation},ignore_index=True)

@@ -47,15 +47,13 @@ def main():
     neg, pos = np.bincount(y)
     scale_pos_weight = neg / pos

     params = {"n_estimators": 400,
-              "objective": "binary:logistic",
-              "scale_pos_weight": scale_pos_weight,
-              "use_label_encoder": False,
-              "learning_rate": 0.1,
-              "eval_metric": "logloss",
-              "random_state": 0
-              }
+              "learning_rate": 0.1,
+              "scale_pos_weight": scale_pos_weight,
+              "objective": "binary:logistic",
+              "random_state": 0,
+              "use_label_encoder": False,
+              "eval_metric": 'logloss'}

     model = xgb.XGBClassifier(**params)
     model.fit(X_train, y_train)

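The hunk above only reorders the XGBoost parameter dictionary; the model behaviour is unchanged. For reference, a self-contained sketch of the same parameters in use (the toy data is an assumption; in the script, X_train and y_train come from the loaded dataset):

import numpy as np
import xgboost as xgb

# Toy stand-in data; the real script builds X_train/y_train from the loaded dataset.
rng = np.random.default_rng(0)
X_train = rng.random((100, 5))
y_train = rng.integers(0, 2, size=100)

neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos  # upweight the positive class when labels are imbalanced

params = {"n_estimators": 400,
          "learning_rate": 0.1,
          "scale_pos_weight": scale_pos_weight,
          "objective": "binary:logistic",
          "random_state": 0,
          # use_label_encoder matches the xgboost version pinned by the repo;
          # newer xgboost releases may warn that it is unused.
          "use_label_encoder": False,
          "eval_metric": "logloss"}

model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)
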
@@ -58,7 +58,6 @@ class XgbClassifier(BaseClassifer):
         scale_pos_weight = neg / pos
         params = {"n_estimators": 400,
                   "learning_rate": 0.1,
-                  "max_depth": 4,
                   "scale_pos_weight": scale_pos_weight,
                   "objective": "binary:logistic",
                   "random_state": 0,

 import numpy as np
+import pandas as pd
 import xgboost
 from typing import List, Tuple
+from sklearn.metrics import confusion_matrix

 def compute_mean(values: List):
     return round(np.mean(values)*100, 3)
@@ -26,4 +28,32 @@ def eval_gini(y_true: np.array, y_prob: np.array) -> float:
         gini += y_i * delta
         delta += 1 - y_i
     gini = 1 - 2 * gini / (ntrue * (n - ntrue))
-    return gini
\ No newline at end of file
+    return gini
+
+def get_cm_by_protected_variable(df, protected_col_name, y_target_name, y_pred_name):
+    confusion_df = pd.DataFrame(columns=[protected_col_name, "FPR", "FNR"])
+    for name in list(df[protected_col_name].unique()):
+        a=df[df[protected_col_name]==name][y_target_name]
+        b=df[df[protected_col_name]==name][y_pred_name]
+        TN, FP, FN, TP = confusion_matrix(list(a), list(b),labels=[0, 1]).ravel()
+        TPR = TP/(TP+FN)
+        TNR = TN/(TN+FP)
+        PPV = TP/(TP+FP)
+        NPV = TN/(TN+FN)
+        FPR = FP/(FP+TN)
+        FNR = FN/(TP+FN)
+        FDR = FP/(TP+FP)
+        ACC = (TP+TN)/(TP+FP+FN+TN)
+        LRplus=TPR/FPR
+        LRminus=FNR/TNR
+        F1=2*(PPV*TPR)/(PPV+TPR)
+        confusion_df = confusion_df.append({protected_col_name:name, "TPR":TPR, "TNR":TNR, "FPR":FPR,
+                                            "FNR":FNR, "PPV":PPV, "NPV":NPV, "FDR":FDR, "ACC":ACC,
+                                            "F1":F1, "LRplus":LRplus, "LRminus":LRminus, "TN":TN,
+                                            "FP":FP, "FN":FN, "TP":TP}, ignore_index=True)
+    return confusion_df

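A hedged usage sketch of the relocated helper: the column names and values below are made up for illustration, whereas in the bias script the validation frame, protected column, target column, and the "output" prediction column are passed in. The helper relies on DataFrame.append, so pandas older than 2.0 is assumed.

import pandas as pd
from utility import metrics  # as imported by the bias script

# Hypothetical validation frame: protected attribute, true label, predicted label.
df = pd.DataFrame({"Gender": [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
                   "Fall":   [0,   0,   1,   1,   0,   0,   1,   1],
                   "output": [0,   1,   0,   1,   0,   1,   0,   1]})

# One row of confusion-matrix rates per value of the protected column.
cm_df = metrics.get_cm_by_protected_variable(df, "Gender", "Fall", "output")
print(cm_df[["Gender", "FPR", "FNR", "TPR", "TNR"]])
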