Christian Fischer Pedersen / AIR · Commits · 946d6d23

Commit 946d6d23 authored Jul 28, 2021 by thecml

introduced settings yaml and paths file

parent 90f7a573
Pipeline #64262 failed with stage in 2 minutes and 57 seconds
Changes 35 · Pipelines 1
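Note: the commit message mentions a new paths file, but that file itself is not rendered in this diff. As a rough sketch only, the new paths module presumably collects the directory constants previously defined in config; the constant names below are inferred from their usage in this commit, and the directory values are assumptions for illustration:

# paths.py - hypothetical sketch; the real file is not shown in this diff.
from pathlib import Path

ROOT_DIR = Path(__file__).resolve().parent            # assumed repository root
CONFIGS_DIR = Path.joinpath(ROOT_DIR, 'configs')      # settings.yaml lives here
RAW_DATA_DIR_2020 = Path.joinpath(ROOT_DIR, 'data/raw/2020')
INTERIM_DATA_DIR = Path.joinpath(ROOT_DIR, 'data/interim')
PROCESSED_DATA_DIR = Path.joinpath(ROOT_DIR, 'data/processed')
TESTS_FILES_DIR = Path.joinpath(ROOT_DIR, 'tests/files')
REPORTS_DIR = Path.joinpath(ROOT_DIR, 'reports')
REPORTS_PLOTS_DIR = Path.joinpath(ROOT_DIR, 'reports/plots')
CLUSTERS_DIR = Path.joinpath(ROOT_DIR, 'models/clusters')
COMPLETE_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/complete/xgboost')
COMPLIANCE_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/compliance/xgboost')
FALL_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/fall/xgboost')
FALL_TEST_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/fall_test/xgboost')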
client.py

 import argparse
-import config as cfg
+import paths as pt
 from src.data import parse_and_clean_data, make_screenings
 from src.data import make_clusters, make_dataset_full
 from src.data import make_dataset_count, make_dataset_emb
...
@@ -41,7 +41,7 @@ def main():
     use_real_ats_names = parsed_args.use_real_ats_names
     run_full_pipeline = parsed_args.run_full_pipeline
     print(f"Client started. Using this configuration:")
-    print(f"Raw data dictionary: {cfg.RAW_DATA_DIR_2020}")
+    print(f"Raw data dictionary: {pt.RAW_DATA_DIR_2020}")
     print(f"Dataset year: {dataset_year}")
     print(f"Dataset version: {dataset_version}")
     print(f"Visualization enabled: {enable_visualization}")
...
@@ -60,7 +60,7 @@ def main():
     print("Making clusters ...")
     make_clusters.main()
-    print(f"Completed making cluster model. It can be found at: {cfg.CLUSTERS_DIR}\n")
+    print(f"Completed making cluster model. It can be found at: {pt.CLUSTERS_DIR}\n")
     print("Making full dataset ...")
     make_dataset_full.main(use_real_ats_names)
...
@@ -73,15 +73,15 @@ def main():
     make_dataset_count.main()
     print("\nCompleted generating datasets at:")
-    print(f"Interim data dictionary: {cfg.INTERIM_DATA_DIR}")
-    print(f"Processed data dictionary: {cfg.PROCESSED_DATA_DIR}\n")
+    print(f"Interim data dictionary: {pt.INTERIM_DATA_DIR}")
+    print(f"Processed data dictionary: {pt.PROCESSED_DATA_DIR}\n")
     print(f"Making 4 XGBoost models based on version: {dataset_version}...\n")
     make_xgb_models.main(dataset_version)
     print(f"Completed making models. Models and SHAP plots can be found at:\n"
-          + f"{cfg.COMPLETE_XGB_DIR}\n" + f"{cfg.COMPLIANCE_XGB_DIR}\n"
-          + f"{cfg.FALL_XGB_DIR}\n" + f"{cfg.FALL_TEST_XGB_DIR}" + "\n")
+          + f"{pt.COMPLETE_XGB_DIR}\n" + f"{pt.COMPLIANCE_XGB_DIR}\n"
+          + f"{pt.FALL_XGB_DIR}\n" + f"{pt.FALL_TEST_XGB_DIR}" + "\n")

 if __name__ == "__main__":
     main()
\ No newline at end of file
configs/settings.yaml (new file, 0 → 100644)

---
# Settings for parser -------------------------------------------------
#
ats_resolution: 10
ex_resolution: 9
ats_delimiter: 6
threshold_weeks: 8
threshold_training: 10
fall_exercise_threshold: 3
fall_exercises: ['8058', '8062', '8066', '8077', '8074', '8059', '8071', '8067']

# Settings for dataset -------------------------------------------------
#
use_real_ats_names: False
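These settings are read with yaml.safe_load, as src/data/make_clusters.py further down in this commit shows. A minimal standalone sketch of the consumption pattern (pt.CONFIGS_DIR is assumed to point at the configs/ directory):

from pathlib import Path
import yaml
import paths as pt  # assumed to expose CONFIGS_DIR

with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
    settings = yaml.safe_load(stream)

# Build the Ats column names from the configured resolution.
cols_ats = [str(i) + 'Ats' for i in range(1, settings['ats_resolution'] + 1)]
print(cols_ats)  # ['1Ats', '2Ats', ..., '10Ats'] for ats_resolution: 10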
src/analysis/find_best_shap_features.py

 #!/usr/bin/env python
 import numpy as np
-import config as cfg
+import paths as pt
 from tools import file_writer, data_loader
 from sklearn.model_selection import StratifiedKFold
 import xgboost as xgb
...
@@ -43,8 +43,8 @@ def main():
     importances = shap_sorted_df['shap_values']
     features = shap_sorted_df['feature']
-    file_writer.write_shap_importance_plot(features, importances, cfg.REPORTS_PLOTS_DIR, PLOT_FILENAME)
-    file_writer.write_csv(shap_sorted_df, cfg.REPORTS_DIR, CSV_FILENAME)
+    file_writer.write_shap_importance_plot(features, importances, pt.REPORTS_PLOTS_DIR, PLOT_FILENAME)
+    file_writer.write_csv(shap_sorted_df, pt.REPORTS_DIR, CSV_FILENAME)

 def get_best_shap_features(X: np.ndarray, y: np.ndarray, cols: List[str], seed: int):
...
src/analysis/make_models_cases.py

 #!/usr/bin/env python
 import numpy as np
-import config as cfg
+import paths as pt
 from typing import List
 from tools import file_reader, file_writer, preprocessor, classifiers
 import tensorflow as tf
-tf.get_logger().setLevel('ERROR')
+from pathlib import Path
+tf.get_logger().setLevel('ERROR')

 NUM_ITER = 10
 CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
...
@@ -22,9 +23,9 @@ class CVResult:
         self.rec = rec
         self.rocauc = rocauc

-ATS_COLS = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)] \
+ATS_COLS = [str(i) + 'Ats' for i in range(1, 10+1)] \
     + ['Cluster', 'LoanPeriod', 'NumberAts']
-EX_COLS = [str(i) + 'Ex' for i in range(1, cfg.EX_RESOLUTION+1)] + ['NumberEx']
+EX_COLS = [str(i) + 'Ex' for i in range(1, 9+1)] + ['NumberEx']
 CLF_NAMES = ["MLP", "LR", "XGB", "RF", "SVM", "KNN"]
 CLASSIFIERS = {"MLP": classifiers.train_mlp_cv,
...
@@ -37,30 +38,30 @@ CLASSIFIERS = {
 def load_complete():
     ats = {str(i)+'Ats': str for i in range(1, 11)}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               'complete.csv', converters=ats)
     return df

 def load_fall():
     converters = {str(i)+'Ats': str for i in range(1, 11)}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               'fall.csv', converters=converters)
     return df

 def load_compliance():
     converters = {str(i)+'Ats': str for i in range(1, 11)}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               'compliance.csv', converters=converters)
     return df

 def load_fall_test():
-    ex = {str(i)+'Ex': str for i in range(1, cfg.EX_RESOLUTION+1)}
-    ats = {str(i)+'Ats': str for i in range(1, cfg.ATS_RESOLUTION+1)}
+    ex = {str(i)+'Ex': str for i in range(1, 9+1)}
+    ats = {str(i)+'Ats': str for i in range(1, 10+1)}
     converters = {**ex, **ats}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               'fall_test.csv', converters=converters)
     return df
...
@@ -69,7 +70,7 @@ def main():
     for case in CASES:
         results_filename = f"{case} baseline results.txt"
         # Version 1
-        with open(Path.joinpath(cfg.REPORTS_DIR, results_filename), "w+") as text_file:
+        with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "w+") as text_file:
             text_file.write(f"{case} version 1 - without Ats and/or Ex columns")
         if case == "Complete":
...
@@ -107,24 +108,24 @@ def main():
                              case, 1, "without Ats and/or Ex columns")
         # Version 2
-        with open(Path.joinpath(cfg.REPORTS_DIR, results_filename), "a") as text_file:
+        with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "a") as text_file:
             text_file.write("\n\n")
             text_file.write(f"{case} version 2 - with embeddings")
         if case == "Complete":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
             X = df.drop(['Complete'], axis=1)
             y = df['Complete']
         elif case == "Compliance":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_emb.csv')
             X = df.drop(['Compliance'], axis=1)
             y = df['Compliance']
         elif case == "Fall":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_emb.csv')
             X = df.drop(['Fall'], axis=1)
             y = df['Fall']
         else:
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
             X = df.drop(['Fall'], axis=1)
             y = df['Fall']
...
@@ -146,24 +147,24 @@ def main():
                              case, 2, "with embeddings")
         # Version 3
-        with open(Path.joinpath(cfg.REPORTS_DIR, results_filename), "a") as text_file:
+        with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "a") as text_file:
             text_file.write("\n\n")
             text_file.write(f"{case} version 3 - with counts")
         if case == "Complete":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete_count.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_count.csv')
             X = df.drop(['Complete'], axis=1)
             y = df['Complete']
         elif case == "Compliance":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'compliance_count.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_count.csv')
             X = df.drop(['Compliance'], axis=1)
             y = df['Compliance']
         elif case == "Fall":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_count.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_count.csv')
             X = df.drop(['Fall'], axis=1)
             y = df['Fall']
         else:
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_test_count.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_test_count.csv')
             X = df.drop(['Fall'], axis=1)
             y = df['Fall']
...
@@ -219,7 +220,7 @@ def make_and_print_scores(classifer_name: str, k: int, res_acc: List,
     metrics = " - ".join(["{}: {:.4f} (+/- {:.2f})".format(r_m.name, r_m.result, r_s.result)
                           for r_m, r_s in (zip(results_mean, results_std) or [])])
     text = "\r{} K={}: {}".format(classifer_name, k, metrics)
-    with open(Path.joinpath(cfg.REPORTS_DIR, results_filename), "a") as text_file:
+    with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "a") as text_file:
         text_file.write(text)

 def make_plots(y_test: np.ndarray, results: np.ndarray,
...
@@ -227,16 +228,16 @@ def make_plots(y_test: np.ndarray, results: np.ndarray,
     roc_file_name = f"{case_name} version {version_number} - ROC curves.pdf"
     results_list = list(results)
-    file_writer.write_roc_curve(y_test, results_list, cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_roc_curve(y_test, results_list, pt.REPORTS_PLOTS_DIR,
                                 roc_file_name, case_subtitle)
-    file_writer.write_accuracy_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_accuracy_plot(results_list, NUM_ITER, CLF_NAMES, pt.REPORTS_PLOTS_DIR,
                                     f"{case_name} version {version_number} - Accuracy.pdf", case_subtitle)
-    file_writer.write_precision_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_precision_plot(results_list, NUM_ITER, CLF_NAMES, pt.REPORTS_PLOTS_DIR,
                                      f"{case_name} version {version_number} - Precision.pdf", case_subtitle)
-    file_writer.write_recall_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_recall_plot(results_list, NUM_ITER, CLF_NAMES, pt.REPORTS_PLOTS_DIR,
                                   f"{case_name} version {version_number} - Recall.pdf", case_subtitle)
-    file_writer.write_rocauc_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_rocauc_plot(results_list, NUM_ITER, CLF_NAMES, pt.REPORTS_PLOTS_DIR,
                                   f"{case_name} version {version_number} - ROCAUC.pdf", case_subtitle)
-    file_writer.write_cm_plot(y_test, results_list[2][1], cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_cm_plot(y_test, results_list[2][1], pt.REPORTS_PLOTS_DIR,
                               f"{case_name} version {version_number} - CM.pdf",
                               "XGB - " f'{case_subtitle}')

 if __name__ == '__main__':
...
src/analysis/make_models_scaling.py

 #!/usr/bin/env python
 import numpy as np
 import pandas as pd
-import config as cfg
+import paths as pt
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.preprocessing import RobustScaler, MaxAbsScaler, QuantileTransformer
 from tools import classifiers, data_loader
...
@@ -41,16 +41,16 @@ def main():
                MaxAbsScaler(), QuantileTransformer(),
                QuantileTransformer(random_state=0),
                QuantileTransformer(output_distribution='normal', random_state=0)]
     output_filename = f"{case} scaling results.txt"
-    with open(Path.joinpath(cfg.REPORTS_DIR, output_filename), "w+") as text_file:
+    with open(Path.joinpath(pt.REPORTS_DIR, output_filename), "w+") as text_file:
         text_file.write(f"{case} case using {len(clfs)} clfs and {len(scalers)} scalers\n\n")
     for clf_name, clf in zip(clf_names, clfs):
         for scaler_name, scaler in zip(scaler_names, scalers):
-            with open(Path.joinpath(cfg.REPORTS_DIR, output_filename), "a") as text_file:
+            with open(Path.joinpath(pt.REPORTS_DIR, output_filename), "a") as text_file:
                 text_file.write(f"Results for {clf_name}, {scaler_name}:\n")
             X_sc = pd.DataFrame(scaler.fit_transform(X.iloc[:,:n_scale_cols]))
             X_new = pd.concat([X_sc, X.iloc[:,n_scale_cols:]], axis=1)
             _, result_acc, result_pre, result_recall, result_rocauc, _ = clf(X_new, y)
-            with open(Path.joinpath(cfg.REPORTS_DIR, output_filename), "a") as text_file:
+            with open(Path.joinpath(pt.REPORTS_DIR, output_filename), "a") as text_file:
                 text_file.write(f"Accuracy: {round(np.mean(result_acc), 3)}\n")
                 text_file.write(f"Precision: {round(np.mean(result_pre), 3)}\n")
                 text_file.write(f"Recall: {round(np.mean(result_recall), 3)}\n")
...
src/analysis/test_ats_model.py

 import pandas as pd
 import numpy as np
 from tools import preprocessor, file_reader, explainer
-import config as cfg
+import paths as pt
 import os
 import csv
 import joblib
 from pathlib import Path

 def main():
-    model = file_reader.read_joblib(cfg.COMPLETE_XGB_DIR,
+    model = file_reader.read_joblib(pt.COMPLETE_XGB_DIR,
                                     'complete_xgboost.joblib')
     input_data = {"Gender": [0],
                   "BirthYear": [46],
...
@@ -18,9 +18,9 @@ def main():
     new_data_df['NumberAts'] = len(new_data_df['Ats'][0].split(","))
     df = preprocessor.split_cat_columns(new_data_df, col_to_split='Ats', tag='Ats',
-                                        resolution=cfg.ATS_RESOLUTION)
+                                        resolution=10)
-    cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
+    cols_ats = [str(i) + 'Ats' for i in range(1, 10+1)]
     header_list = ['Gender', 'BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts'] + cols_ats
     df = df.reindex(columns=header_list)
...
@@ -29,8 +29,8 @@ def main():
     df['Cluster'] = 14
     df['Cluster'] = pd.to_numeric(df['Cluster'])
-    for i in range(1, cfg.ATS_RESOLUTION+1):
-        path = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
+    for i in range(1, 10+1):
+        path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
         embedding = file_reader.read_embedding(path, f'complete_{i}Ats.csv')
         column = f'{i}Ats'
         df[column] = df[column].replace(to_replace=embedding)
...
src/analysis/test_ats_model_one_citizen.py

 import pandas as pd
 import numpy as np
-from tools import preprocessor, file_reader, explainer
-import config as cfg
+from tools import file_reader, explainer
+import paths as pt
 from pathlib import Path

 def main():
-    model = file_reader.read_joblib(cfg.COMPLETE_XGB_DIR,
+    model = file_reader.read_joblib(pt.COMPLETE_XGB_DIR,
                                     'complete_xgboost.joblib')
-    converters = {str(i)+'Ats': str for i in range(1, cfg.ATS_RESOLUTION+1)}
-    df = file_reader.read_csv(cfg.TESTS_FILES_DIR,
+    converters = {str(i)+'Ats': str for i in range(1, 10+1)}
+    df = file_reader.read_csv(pt.TESTS_FILES_DIR,
                               'test_citizens.csv', converters=converters)
-    for i in range(1, cfg.ATS_RESOLUTION+1):
-        path = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
+    for i in range(1, 10+1):
+        path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
         embedding = file_reader.read_embedding(path, f'complete_{i}Ats.csv')
         column = f'{i}Ats'
         df[column] = df[column].replace(to_replace=embedding)
...
src/analysis/test_ats_resolution.py

 import pandas as pd
 import numpy as np
-import config as cfg
+import paths as pt
 import os
 import csv
 import joblib
...
@@ -18,14 +18,14 @@ def main():
     target_name = "Complete"
     step_size = 10
-    for idx in range(1, cfg.ATS_RESOLUTION+1, step_size):
+    for idx in range(1, 10+1, step_size):
         logloss_train, logloss_test = list(), list()
         auc_train, auc_test = list(), list()
         for ats_res in range(idx, idx+step_size):
             make_dataset_full.main(ats_resolution=ats_res)
             make_dataset_emb.main(ats_resolution=ats_res)
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
             df = df.sample(frac=1, random_state=0).reset_index(drop=True)
...
@@ -79,7 +79,7 @@ def main():
     plt.ylabel('Logloss')
     plt.xlabel('Iterations')
     plt.title(file_name)
-    plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, f'{file_name}.pdf'),
+    plt.savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, f'{file_name}.pdf'),
                 dpi=300, bbox_inches="tight")
...
src/analysis/test_fall_test_model.py

 import pandas as pd
 import numpy as np
 from tools import preprocessor, file_reader
-import config as cfg
+import paths as pt
 import os
 import csv
 import joblib
 from pathlib import Path

 def main():
-    model = file_reader.read_joblib(cfg.FALL_TEST_XGB_DIR,
+    model = file_reader.read_joblib(pt.FALL_TEST_XGB_DIR,
                                     'fall_test_xgboost.joblib')
     for gender in range(0, 2):
...
@@ -36,14 +36,14 @@ def main():
     df = preprocessor.split_cat_columns(new_data_df, col_to_split='Ats', tag='Ats',
-                                        resolution=cfg.ATS_RESOLUTION)
+                                        resolution=10)
     df = preprocessor.split_cat_columns(df, col_to_split='Ex', tag='Ex',
-                                        resolution=cfg.ATS_RESOLUTION)
+                                        resolution=10)
-    cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
-    cols_ex = [str(i) + 'Ex' for i in range(1, cfg.EX_RESOLUTION+1)]
+    cols_ats = [str(i) + 'Ats' for i in range(1, 10+1)]
+    cols_ex = [str(i) + 'Ex' for i in range(1, 9+1)]
     header_list = ['Gender', 'BirthYear', "Cluster", "LoanPeriod", "NumberSplit",
                    "NumberScreening", "NumberWeeks", "MeanEvaluation", "NumberFalls",
...
@@ -53,15 +53,15 @@ def main():
     df = df.reindex(columns=header_list)
     df = df.fillna('0')
-    for i in range(1, cfg.ATS_RESOLUTION+1):
-        path = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
+    for i in range(1, 10+1):
+        path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
         embedding = file_reader.read_embedding(path, f'fall_test_{i}Ats.csv')
         column = f'{i}Ats'
         df[column] = df[column].replace(to_replace=embedding)
         df[column] = pd.to_numeric(df[column])
-    for i in range(1, cfg.EX_RESOLUTION+1):
-        path = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
+    for i in range(1, 9+1):
+        path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
         embedding = file_reader.read_embedding(path, f'fall_test_{i}Ex.csv')
         column = f'{i}Ex'
         df[column] = df[column].replace(to_replace=embedding)
...
src/analysis/tune_kmodes_settings.py

 #!/usr/bin/env python
-import config as cfg
+import paths as pt
 import pandas as pd
 from sklearn.pipeline import Pipeline
 from sklearn.base import BaseEstimator, TransformerMixin
...
@@ -11,11 +11,10 @@ from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import StratifiedKFold

 def main():
-    df = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'screenings.csv',
+    df = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                               converters={'CitizenId': str})
-    df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
-                                        resolution=cfg.ATS_RESOLUTION)
+    df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats', resolution=10)
     df = feature_maker.make_complete_feature(df)
     general_cols = df[['CitizenId', 'Gender', 'BirthYear', 'LoanPeriod']]
...
@@ -76,7 +75,7 @@ def main():
     print('\nBest hyperparameters:')
     print(random_search.best_params_)
     results = pd.DataFrame(random_search.cv_results_)
-    file_writer.write_csv(results, cfg.REPORTS_DIR, 'kmodes-settings-random-grid-search-results.csv')
+    file_writer.write_csv(results, pt.REPORTS_DIR, 'kmodes-settings-random-grid-search-results.csv')

 if __name__ == '__main__':
     main()
src/analysis/tune_mlp_keras_tuner.py

 from tools import data_loader
 import tensorflow as tf
 import kerastuner as kt
-import config as cfg
 from pathlib import Path
+import paths as pt
 import shutil

 CASE = "Complete"
...
@@ -38,7 +38,7 @@ def main():
     tuner = kt.BayesianOptimization(create_model,
                                     objective='val_accuracy',
                                     max_trials=20,
                                     executions_per_trial=2,
-                                    directory=Path.joinpath(cfg.REPORTS_DIR, 'keras_tuner'),
+                                    directory=Path.joinpath(pt.REPORTS_DIR, 'keras_tuner'),
                                     project_name='complete_mlp',
                                     seed=0)
...
@@ -47,7 +47,7 @@ def main():
     print(tuner.get_best_hyperparameters(num_trials=1)[0].values)
-    shutil.rmtree(Path.joinpath(cfg.REPORTS_DIR, 'keras_tuner'))
+    shutil.rmtree(Path.joinpath(pt.REPORTS_DIR, 'keras_tuner'))

 if __name__ == '__main__':
     main()
...
src/analysis/tune_xgb_random_search.py

 import numpy as np
 import pandas as pd
-import config as cfg
+import paths as pt
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import RandomizedSearchCV
...
@@ -64,7 +64,7 @@ def main():
     print('\nBest hyperparameters:')
     print(random_search.best_params_)
     results = pd.DataFrame(random_search.cv_results_)
-    file_writer.write_csv(results, cfg.REPORTS_DIR, 'xgb-random-grid-search-results.csv')
+    file_writer.write_csv(results, pt.REPORTS_DIR, 'xgb-random-grid-search-results.csv')

 if __name__ == "__main__":
     main()
\ No newline at end of file
src/data/make_clusters.py

...
@@ -2,24 +2,25 @@
 import numpy as np
 import pandas as pd
-import config as cfg
+import paths as pt
+import yaml
 from typing import List
 from kmodes import kmodes
 from pathlib import Path
 from tools import file_reader, file_writer, preprocessor

-USE_ATS_NAMES = False

 def main():
-    df = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'screenings.csv',
+    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
+        settings = yaml.safe_load(stream)
+    df = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                               converters={'CitizenId': str})
     df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
-                                        resolution=cfg.ATS_RESOLUTION)
+                                        resolution=settings['ats_resolution'])
-    if USE_ATS_NAMES:
+    if settings['use_real_ats_names']:
         df = preprocessor.replace_cat_values(df)
-    cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
+    cols_ats = [str(i) + 'Ats' for i in range(1, settings['ats_resolution']+1)]
     header_list = ['CitizenId'] + cols_ats
     df = df[header_list]
...
@@ -39,9 +40,9 @@ def main():
     cluster_centroids = pd.DataFrame(dict([i for i in zip(range(0, len(model.cluster_centroids_)),
                                                           model.cluster_centroids_)]))
-    file_writer.write_joblib(model, cfg.CLUSTERS_DIR, 'km.joblib')
-    file_writer.write_csv(cluster_centroids, cfg.INTERIM_DATA_DIR, f'cluster_centroids.csv')
-    file_writer.write_csv(clusters, cfg.INTERIM_DATA_DIR, 'cl.csv')
+    file_writer.write_joblib(model, pt.CLUSTERS_DIR, 'km.joblib')
+    file_writer.write_csv(cluster_centroids, pt.INTERIM_DATA_DIR, f'cluster_centroids.csv')
+    file_writer.write_csv(clusters, pt.INTERIM_DATA_DIR, 'cl.csv')

 if __name__ == '__main__':
     main()
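The open/yaml.safe_load pattern in make_clusters.py is repeated in make_dataset_count.py below; one way to avoid the duplication would be a small shared helper, sketched here as a suggestion only (the module name and location are hypothetical, not part of this commit):

# utility/settings_loader.py - hypothetical helper, not in this commit
from pathlib import Path
import yaml
import paths as pt

def load_settings(filename: str = "settings.yaml") -> dict:
    """Read a YAML settings file from the configs directory."""
    with open(Path.joinpath(pt.CONFIGS_DIR, filename), 'r') as stream:
        return yaml.safe_load(stream)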
src/data/make_dataset_count.py

 #!/usr/bin/env python
-import config as cfg
+import paths as pt
 from tools import file_reader, file_writer
 from tools import preprocessor
 from utility import embedder
 import pandas as pd
 import numpy as np
 from pathlib import Path
+import yaml

 def main():
     make_complete_count()
...
@@ -13,75 +15,83 @@ def main():
     make_fall_test()

 def make_complete_count():
+    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
+        settings = yaml.safe_load(stream)
     case = 'Complete'
-    ats = {str(i)+'Ats': str for i in range(1, cfg.ATS_RESOLUTION+1)}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    ats = {str(i)+'Ats': str for i in range(1, settings['ats_resolution']+1)}
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               f'complete.csv', converters=ats)
-    cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]