Christian Fischer Pedersen / AIR

Commit b6406ffa, authored Oct 28, 2021 by Christian Marius Lillelund

    improved bias script, adjusted settings

Parent: 5920ed0a
Pipeline #87251 passed in 3 minutes
Changes: 9 files, 1 pipeline
ml/configs/complete_emb.yaml

@@ -12,7 +12,7 @@ threshold_training: 10
 train_ratio: 0.8
 batch_size: 32
 num_epochs: 5
-verbose: True
+verbose: False
 network_layers: [128]
 optimizer: "Adam"
ml/configs/compliance_emb.yaml

@@ -12,7 +12,7 @@ threshold_training: 10
 train_ratio: 0.8
 batch_size: 32
 num_epochs: 10
-verbose: True
+verbose: False
 network_layers: [128]
 optimizer: "Adam"
ml/configs/fall_emb.yaml

@@ -12,7 +12,7 @@ threshold_training: 10
 train_ratio: 0.8
 batch_size: 32
 num_epochs: 10
-verbose: True
+verbose: False
 network_layers: [128]
 optimizer: "Adam"
ml/configs/risk_emb.yaml

@@ -12,7 +12,7 @@ train_ratio: 0.8
 batch_size: 32
 num_epochs_ats: 10
 num_epochs_ex: 5
-verbose: True
+verbose: False
 network_layers: [128]
 optimizer: "Adam"
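All four configs turn verbose off; these YAML files are read by the training and analysis scripts (the bias script below opens fall_emb.yaml from pt.CONFIGS_DIR). A minimal loading sketch, assuming PyYAML and that pt is the repo's paths module; the exact load call is not shown in this diff:

    from pathlib import Path
    import yaml

    import paths as pt  # assumption: the module the scripts import as `pt`, exposing CONFIGS_DIR

    with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), "r") as stream:
        settings = yaml.safe_load(stream)

    batch_size = settings["batch_size"]          # 32
    num_epochs = settings["num_epochs"]          # 10
    verbose = settings["verbose"]                # False after this commit
    network_layers = settings["network_layers"]  # [128]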
ml/src/analysis/test_model_features.py → ml/src/analysis/test_model_best_features.py

File moved
ml/src/analysis/test_model_bias.py

@@ -8,40 +8,11 @@ import xgboost as xgb
 import seaborn as sns
 from sklearn.model_selection import StratifiedKFold
 from utility import metrics
 from sklearn.metrics import confusion_matrix
 from tools import data_loader, file_writer, file_reader
 from sklearn.metrics import accuracy_score, precision_score
 from sklearn.metrics import recall_score, roc_auc_score
 import matplotlib.pyplot as plt

-def get_df_w_metrics(df, protected_col_name, y_target_name, y_pred_name):
-    confusion_df = pd.DataFrame(columns=[protected_col_name, "FPR", "FNR"])
-    for name in list(df[protected_col_name].unique()):
-        a = df[df[protected_col_name] == name][y_target_name]
-        b = df[df[protected_col_name] == name][y_pred_name]
-        TN, FP, FN, TP = confusion_matrix(list(a), list(b), labels=[0, 1]).ravel()
-        TPR = TP / (TP + FN)
-        TNR = TN / (TN + FP)
-        PPV = TP / (TP + FP)
-        NPV = TN / (TN + FN)
-        FPR = FP / (FP + TN)
-        FNR = FN / (TP + FN)
-        FDR = FP / (TP + FP)
-        ACC = (TP + TN) / (TP + FP + FN + TN)
-        LRplus = TPR / FPR
-        LRminus = FNR / TNR
-        F1 = 2 * (PPV * TPR) / (PPV + TPR)
-        confusion_df = confusion_df.append({protected_col_name: name, "TPR": TPR, "TNR": TNR, "FPR": FPR,
-                                            "FNR": FNR, "PPV": PPV, "NPV": NPV, "FDR": FDR, "ACC": ACC,
-                                            "F1": F1, "LRplus": LRplus, "LRminus": LRminus,
-                                            "TN": TN, "FP": FP, "FN": FN, "TP": TP}, ignore_index=True)
-    return confusion_df

 def main():
     # Load settings
     with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), 'r') as stream:
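The removed get_df_w_metrics helper (re-added below in ml/src/utility/metrics.py as get_cm_by_protected_variable) derives all of its per-group rates from the four confusion-matrix cells. A small self-contained check of those formulas on toy labels, illustration only, not project data:

    import numpy as np
    from sklearn.metrics import confusion_matrix

    # Toy predictions for one group of a protected attribute (invented values).
    y_true = [0, 0, 1, 1, 1, 0, 1, 0]
    y_pred = [0, 1, 1, 0, 1, 0, 1, 0]

    # sklearn returns the cells in the order TN, FP, FN, TP for labels=[0, 1].
    TN, FP, FN, TP = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    TPR = TP / (TP + FN)                   # sensitivity / recall
    FPR = FP / (FP + TN)                   # false positive rate
    FNR = FN / (TP + FN)                   # miss rate (same denominator as TPR)
    ACC = (TP + TN) / (TP + FP + FN + TN)

    print(TN, FP, FN, TP)                  # 3 1 1 3
    print(TPR, FPR, FNR, ACC)              # 0.75 0.25 0.25 0.75

Note that the helper also divides by FPR and TNR for the likelihood ratios (LRplus = TPR / FPR, LRminus = FNR / TNR), so a group with zero false positives or zero true negatives raises a ZeroDivisionError.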
@@ -90,7 +61,7 @@ def main():
                                       eval_set=eval_set,
                                       eval_metric=metrics.gini_xgb,
                                       early_stopping_rounds=early_stopping_rounds,
-                                      verbose=False)
+                                      verbose=False)
            else:
                fit_model = model.fit(X_train_split, y_train_split)
@@ -100,20 +71,26 @@ def main():
            # Save data
            y_true_pd = y_valid_split.to_frame().reset_index(drop=True)
-           y_pred_pd = y_valid_scores.apply(lambda x: 1 if x == True else 0).to_frame().reset_index(drop=True).rename(columns={y_col_name: "output"})
+           y_valid_scores = y_valid_scores.apply(lambda x: 1 if x == True else 0).to_frame()
+           y_pred_pd = y_valid_scores.reset_index(drop=True).rename(columns={y_col_name: "output"})
            y_pred_prob_pd = pd.DataFrame(pred, columns=["output_prob"])
-           df_subset = pd.concat([X_valid_split.reset_index(drop=True), y_true_pd, y_pred_pd, y_pred_prob_pd], axis=1)
+           df_subset = pd.concat([X_valid_split.reset_index(drop=True), y_true_pd, y_pred_pd, y_pred_prob_pd], axis=1)
            df_test = df_test.append(df_subset, ignore_index=True)

            # Save metrics
-           df_evaluate_proc = get_df_w_metrics(df_subset, protected_col_name, y_col_name, "output")
-           file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, "model" + str(i) + "_" + protected_col_name + ".csv")
+           df_evaluate_proc = metrics.get_cm_by_protected_variable(df_subset, protected_col_name, y_col_name, "output")
+           file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, "model" + str(i) + "_" + protected_col_name + ".csv")

            df_evaluate_together = df_subset.copy()
            df_evaluate_together[protected_col_name] = "all"
-           df_evaluate_all = get_df_w_metrics(df_evaluate_together, protected_col_name, y_col_name, "output")
-           file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, "model" + str(i) + "_" + protected_col_name + "_all.csv")
+           df_evaluate_all = metrics.get_cm_by_protected_variable(df_evaluate_together, protected_col_name, y_col_name, "output")
+           file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, "model" + str(i) + "_" + protected_col_name + "_all.csv")

            valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
            valid_pre.append(precision_score(y_valid_split, y_valid_scores))
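With this change the per-fold fairness tables come from metrics.get_cm_by_protected_variable instead of the local helper. A hedged usage sketch on an invented frame; the column names are chosen for illustration only (the real call passes the script's protected_col_name and y_col_name), and the helper relies on DataFrame.append, so it needs a pandas version that still provides it (pre-2.0):

    import pandas as pd
    from utility import metrics  # the repo's module, assumed importable as in the scripts

    # Toy validation frame: each group value covers all four confusion-matrix cells,
    # so none of the helper's ratios divides by zero.
    df = pd.DataFrame({
        "Gender": [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
        "Fall":   [0,   0,   1,   1,   0,   0,   1,   1],
        "output": [0,   1,   0,   1,   0,   1,   0,   1],
    })

    # One row per group value with TPR/TNR/FPR/FNR/PPV/NPV/FDR/ACC/F1/LRplus/LRminus and raw counts.
    cm = metrics.get_cm_by_protected_variable(df, "Gender", "Fall", "output")
    print(cm)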
@@ -134,17 +111,21 @@ def main():
    # Save the confusion data for all
    column_names = ["Group", "ML", "Measure", "Value"]
+   measures = ['FPR', 'FNR', 'ACC', 'F1', 'FDR', 'LRminus', 'LRplus', 'NPV', 'PPV', 'TNR', 'TPR', 'TP', 'TN', 'FN', 'FP']
    df_out = pd.DataFrame(columns=column_names)
    for i in range(5):
        data = file_reader.read_csv(pt.INTERIM_DATA_DIR, f'model{i}_{protected_col_name}_all.csv')
        for group in ["all"]:
-           for measure in ['FPR', 'FNR', 'ACC', 'F1', 'FDR', 'LRminus', 'LRplus', 'NPV', 'PPV', 'TNR', 'TPR', 'TP', 'TN', 'FN', 'FP']:
+           for measure in measures:
                value = float(data[data[protected_col_name] == group][measure])
-               df_out = df_out.append({'Group': group, "ML": "XGBoost" + str(i), "Measure": measure, "Value": value}, ignore_index=True)
+               df_out = df_out.append({'Group': group, "ML": "XGBoost" + str(i), "Measure": measure, "Value": value}, ignore_index=True)
    file_writer.write_csv(df_out, pt.INTERIM_DATA_DIR, 'XGBoost_metrics_crossvalidated_all.csv')

-   global_all_bar = sns.barplot(data=df_out[df_out["Measure"].isin(["FPR", "FNR", "TPR", "TNR"])], x="Group", y="Value", ci=95, hue="Measure")
+   global_all_bar = sns.barplot(data=df_out[df_out["Measure"].isin(["FPR", "FNR", "TPR", "TNR"])], x="Group", y="Value", ci=95, hue="Measure")
    global_all_bar.set_title('All')
    global_all_bar.get_figure().savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, f"{protected_col_name}_barplot_all.pdf"))
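Both the collection loops here and the fairness helper grow their frames with DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0, so the script is tied to the older pandas version the project pins. A hedged equivalent pattern, not part of this commit, that collects plain dicts and builds the frame once:

    import pandas as pd

    measures = ["FPR", "FNR", "TPR", "TNR"]
    rows = []
    for i in range(5):
        for measure in measures:
            # Placeholder value; in the script this is read from the per-model CSV.
            rows.append({"Group": "all", "ML": "XGBoost" + str(i),
                         "Measure": measure, "Value": 0.5})
    df_out = pd.DataFrame(rows, columns=["Group", "ML", "Measure", "Value"])
    print(df_out.head())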
@@ -155,12 +136,14 @@ def main():
    for i in range(5):
        data = file_reader.read_csv(pt.INTERIM_DATA_DIR, f'model{i}_{protected_col_name}.csv')
        for group in [0.0, 1.0]:
-           for measure in ['FPR', 'FNR', 'ACC', 'F1', 'FDR', 'LRminus', 'LRplus', 'NPV', 'PPV', 'TNR', 'TPR', 'TP', 'TN', 'FN', 'FP']:
+           for measure in measures:
                value = float(data[data[protected_col_name] == group][measure])
-               df_out = df_out.append({'Group': group, "ML": "XGBoost" + str(i), "Measure": measure, "Value": value}, ignore_index=True)
+               df_out = df_out.append({'Group': group, "ML": "XGBoost" + str(i), "Measure": measure, "Value": value}, ignore_index=True)
    file_writer.write_csv(df_out, pt.INTERIM_DATA_DIR, f'XGBoost_metrics_crossvalidated_{protected_col_name}.csv')

-   global_proc_bar = sns.barplot(data=df_out[df_out["Measure"].isin(["FPR", "FNR", "TPR", "TNR"])], x="Group", y="Value", ci=95, hue="Measure")
+   global_proc_bar = sns.barplot(data=df_out[df_out["Measure"].isin(["FPR", "FNR", "TPR", "TNR"])], x="Group", y="Value", ci=95, hue="Measure")
    global_proc_bar.set_title(f'Proctected: {protected_col_name}')
    global_all_bar.get_figure().savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, "barplot_proc.pdf"))
@@ -227,7 +210,7 @@ def main():
        ax[i].set_ylabel('', fontsize=20)
        ax[i].set_xlabel("", fontsize=20)
-   plt.savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, "XGBoost Gender Metrics.pdf"), dpi=300, bbox_inches="tight")
+   plt.savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, "XGBoost Gender Metrics.pdf"), dpi=300, bbox_inches="tight")

    # Calculate relation between male/female
    frame = all_data_gender
@@ -236,8 +219,10 @@ def main():
    for i in list(frame["Model"].unique()):
        for j in list(frame["Metric"].unique()):
            if j not in ["Mean_y_target", "Mean_y_hat_prob"]:
-               female_val = frame[(frame["Model"] == i) & (frame["Metric"] == j) & (frame["Gender"] == "Female")]["Value"].mean()
-               male_val = frame[(frame["Model"] == i) & (frame["Metric"] == j) & (frame["Gender"] == "Male")]["Value"].mean()
+               female_val = frame[(frame["Model"] == i) & (frame["Metric"] == j) & (frame["Gender"] == "Female")]["Value"].mean()
+               male_val = frame[(frame["Model"] == i) & (frame["Metric"] == j) & (frame["Gender"] == "Male")]["Value"].mean()
                relation = female_val / male_val
                newFrame = newFrame.append({"Model": i, "Metric": j, "Relation": relation}, ignore_index=True)
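The last hunk computes, per model and metric, the ratio of the Female mean to the Male mean with three nested loops. The same relation can be expressed with a pivot; a hedged sketch on invented values (the real frame is all_data_gender, and the script additionally skips the Mean_y_target and Mean_y_hat_prob metrics):

    import pandas as pd

    # Toy stand-in for all_data_gender (invented numbers).
    frame = pd.DataFrame({
        "Model":  ["XGBoost0", "XGBoost0", "XGBoost0", "XGBoost0"],
        "Metric": ["FPR", "FPR", "FNR", "FNR"],
        "Gender": ["Female", "Male", "Female", "Male"],
        "Value":  [0.20, 0.25, 0.10, 0.08],
    })

    means = (frame.groupby(["Model", "Metric", "Gender"])["Value"].mean()
                  .unstack("Gender"))
    relation = (means["Female"] / means["Male"]).rename("Relation").reset_index()
    print(relation)  # Female/Male ratio per Model and Metric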
ml/src/model/train_xgboost_model.py

@@ -47,15 +47,13 @@ def main():
    neg, pos = np.bincount(y)
    scale_pos_weight = neg / pos
    params = {"n_estimators": 400,
-             "objective": "binary:logistic",
-             "scale_pos_weight": scale_pos_weight,
-             "use_label_encoder": False,
-             "learning_rate": 0.1,
-             "eval_metric": "logloss",
-             "random_state": 0}
+             "learning_rate": 0.1,
+             "scale_pos_weight": scale_pos_weight,
+             "objective": "binary:logistic",
+             "random_state": 0,
+             "use_label_encoder": False,
+             "eval_metric": 'logloss'}
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
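train_xgboost_model.py keeps scale_pos_weight = neg / pos, the usual XGBoost correction for class imbalance: the positive class is up-weighted by the ratio of negative to positive examples. A self-contained sketch of that computation on synthetic labels, illustration only; the parameter values mirror the dict above, minus use_label_encoder, which only matters on older xgboost releases:

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    y = (rng.random(1000) < 0.1).astype(int)   # ~10% positives (synthetic labels)
    X = rng.random((1000, 5))                  # synthetic features

    neg, pos = np.bincount(y)                  # counts of class 0 and class 1
    scale_pos_weight = neg / pos               # roughly 9 here

    model = xgb.XGBClassifier(n_estimators=400, learning_rate=0.1,
                              scale_pos_weight=scale_pos_weight,
                              objective="binary:logistic",
                              eval_metric="logloss", random_state=0)
    model.fit(X, y)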
ml/src/tools/classifiers.py

@@ -58,7 +58,6 @@ class XgbClassifier(BaseClassifer):
        scale_pos_weight = neg / pos
        params = {"n_estimators": 400,
                  "learning_rate": 0.1,
                  "max_depth": 4,
                  "scale_pos_weight": scale_pos_weight,
                  "objective": "binary:logistic",
                  "random_state": 0,
ml/src/utility/metrics.py

 import numpy as np
 import pandas as pd
 import xgboost
 from typing import List, Tuple
 from sklearn.metrics import confusion_matrix

 def compute_mean(values: List):
     return round(np.mean(values) * 100, 3)

@@ -26,4 +28,32 @@ def eval_gini(y_true: np.array, y_prob: np.array) -> float:
         gini += y_i * delta
         delta += 1 - y_i
     gini = 1 - 2 * gini / (ntrue * (n - ntrue))
-    return gini
\ No newline at end of file
+    return gini
+
+
+def get_cm_by_protected_variable(df, protected_col_name, y_target_name, y_pred_name):
+    confusion_df = pd.DataFrame(columns=[protected_col_name, "FPR", "FNR"])
+    for name in list(df[protected_col_name].unique()):
+        a = df[df[protected_col_name] == name][y_target_name]
+        b = df[df[protected_col_name] == name][y_pred_name]
+        TN, FP, FN, TP = confusion_matrix(list(a), list(b), labels=[0, 1]).ravel()
+        TPR = TP / (TP + FN)
+        TNR = TN / (TN + FP)
+        PPV = TP / (TP + FP)
+        NPV = TN / (TN + FN)
+        FPR = FP / (FP + TN)
+        FNR = FN / (TP + FN)
+        FDR = FP / (TP + FP)
+        ACC = (TP + TN) / (TP + FP + FN + TN)
+        LRplus = TPR / FPR
+        LRminus = FNR / TNR
+        F1 = 2 * (PPV * TPR) / (PPV + TPR)
+        confusion_df = confusion_df.append({protected_col_name: name, "TPR": TPR, "TNR": TNR, "FPR": FPR,
+                                            "FNR": FNR, "PPV": PPV, "NPV": NPV, "FDR": FDR, "ACC": ACC,
+                                            "F1": F1, "LRplus": LRplus, "LRminus": LRminus,
+                                            "TN": TN, "FP": FP, "FN": FN, "TP": TP}, ignore_index=True)
+    return confusion_df
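eval_gini in metrics.py is the normalized Gini coefficient (presumably wrapped by the gini_xgb callback passed to model.fit in the bias script). For binary labels without tied scores it equals 2 * AUC - 1. A hedged sanity check: the top of eval_gini is not visible in this diff, so the lines above the shown tail are reconstructed from the standard implementation it appears to follow.

    import numpy as np
    from sklearn.metrics import roc_auc_score

    def eval_gini(y_true, y_prob):
        # Reconstructed head (assumption); the last four lines match the diff above.
        y_true = np.asarray(y_true)[np.argsort(y_prob)]
        ntrue, gini, delta = 0, 0, 0
        n = len(y_true)
        for i in range(n - 1, -1, -1):
            y_i = y_true[i]
            ntrue += y_i
            gini += y_i * delta
            delta += 1 - y_i
        gini = 1 - 2 * gini / (ntrue * (n - ntrue))
        return gini

    rng = np.random.default_rng(0)
    y_true = rng.integers(0, 2, 500)
    y_prob = rng.random(500)
    print(eval_gini(y_true, y_prob))              # matches 2 * AUC - 1
    print(2 * roc_auc_score(y_true, y_prob) - 1)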