Christian Fischer Pedersen / AIR / Commits / 8820cbc1

Commit 8820cbc1, authored Mar 19, 2021 by Christian Marius Lillelund

    added fall long case, script for plain xgb model

Parent: 5760e7af
Changes: 13 files
src/analysis/make_models_scaling.py

@@ -2,12 +2,8 @@
import numpy as np
import pandas as pd
import config as cfg
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler, MaxAbsScaler, QuantileTransformer
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tools import classifiers, data_loader
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
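
Only the import block of this script appears in the hunk. For context, here is a minimal sketch of the kind of "plain xgb model" the commit message refers to, built from these imports; the synthetic data, split, and hyperparameters are illustrative assumptions, since the committed script relies on the project's data_loader and classifiers helpers, which are not shown here:

# Minimal sketch, not the committed script: synthetic data stands in for the
# project's data_loader, and every hyperparameter below is an assumption.
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20))                              # placeholder feature matrix
y = (X[:, 0] + 0.5 * rng.normal(size=1000) > 0).astype(int)  # placeholder binary target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

scaler = StandardScaler()            # any of the imported scalers fits this slot
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, eval_metric='logloss')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
print(f"acc={accuracy_score(y_test, y_pred):.3f}",
      f"auc={roc_auc_score(y_test, y_prob):.3f}",
      f"precision={precision_score(y_test, y_pred):.3f}",
      f"recall={recall_score(y_test, y_pred):.3f}")

The imported scaler family (StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, QuantileTransformer) together with BaseEstimator and TransformerMixin suggests the script benchmarks scaling strategies around the XGBoost model, possibly via a custom identity transformer as a no-scaling baseline.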
src/config.py

@@ -30,25 +30,28 @@ CLUSTERS_DIR = Path.joinpath(ROOT_DIR, 'models/clusters')
COMPLETE_DIR = Path.joinpath(ROOT_DIR, 'models/complete')
COMPLETE_TF_DIR = Path.joinpath(ROOT_DIR, 'models/complete/tensorflow')
COMPLETE_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/complete/xgboost')
COMPLETE_CAT_DIR = Path.joinpath(ROOT_DIR, 'models/complete/catboost')
COMPLETE_RF_DIR = Path.joinpath(ROOT_DIR, 'models/complete/random_forest')
COMPLETE_EMB_DIR = Path.joinpath(ROOT_DIR, 'models/complete/embeddings')
COMPLETE_EMB_FULL_DIR = Path.joinpath(ROOT_DIR, 'models/complete/embeddings_full')
FALL_DIR = Path.joinpath(ROOT_DIR, 'models/fall')
FALL_TF_DIR = Path.joinpath(ROOT_DIR, 'models/fall/tensorflow')
FALL_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/fall/xgboost')
FALL_CAT_DIR = Path.joinpath(ROOT_DIR, 'models/fall/catboost')
FALL_RF_DIR = Path.joinpath(ROOT_DIR, 'models/fall/random_forest')
FALL_EMB_DIR = Path.joinpath(ROOT_DIR, 'models/fall/embeddings')
SUCCESS_DIR = Path.joinpath(ROOT_DIR, 'models/success')
SUCCESS_TF_DIR = Path.joinpath(ROOT_DIR, 'models/success/tensorflow')
SUCCESS_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/success/xgboost')
SUCCESS_CAT_DIR = Path.joinpath(ROOT_DIR, 'models/success/catboost')
SUCCESS_RF_DIR = Path.joinpath(ROOT_DIR, 'models/success/random_forest')
SUCCESS_EMB_DIR = Path.joinpath(ROOT_DIR, 'models/success/embeddings')
COMPLIANCE_DIR = Path.joinpath(ROOT_DIR, 'models/compliance')
COMPLIANCE_TF_DIR = Path.joinpath(ROOT_DIR, 'models/compliance/tensorflow')
COMPLIANCE_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/compliance/xgboost')
COMPLIANCE_RF_DIR = Path.joinpath(ROOT_DIR, 'models/compliance/random_forest')
COMPLIANCE_EMB_DIR = Path.joinpath(ROOT_DIR, 'models/compliance/embeddings')
FALL_SHORT_DIR = Path.joinpath(ROOT_DIR, 'models/fall_short')
FALL_SHORT_TF_DIR = Path.joinpath(ROOT_DIR, 'models/fall_short/tensorflow')
FALL_SHORT_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/fall_short/xgboost')
FALL_SHORT_RF_DIR = Path.joinpath(ROOT_DIR, 'models/fall_short/random_forest')
FALL_SHORT_EMB_DIR = Path.joinpath(ROOT_DIR, 'models/fall_short/embeddings')
FALL_LONG_DIR = Path.joinpath(ROOT_DIR, 'models/fall_long')
FALL_LONG_TF_DIR = Path.joinpath(ROOT_DIR, 'models/fall_long/tensorflow')
FALL_LONG_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/fall_long/xgboost')
FALL_LONG_RF_DIR = Path.joinpath(ROOT_DIR, 'models/fall_long/random_forest')
FALL_LONG_EMB_DIR = Path.joinpath(ROOT_DIR, 'models/fall_long/embeddings')
GENERAL_FEATURES = ['Gender', 'Age', 'Cluster']
ATS_RESOLUTION = 50
EX_RESOLUTION = 9
ATS_DELIMITER = 6
THRESHOLD_WEEKS = 8
THRESHOLD_TRAINING = 10
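
The new FALL_LONG_* block follows the same pattern as the existing cases; all of these constants are plain pathlib.Path objects built with the unbound Path.joinpath. A small sketch of how such a constant behaves; the ROOT_DIR value here is a stand-in, since the real one is defined earlier in config.py, outside this hunk:

from pathlib import Path

ROOT_DIR = Path('/tmp/air')  # stand-in; the real ROOT_DIR is defined earlier in config.py
FALL_LONG_XGB_DIR = Path.joinpath(ROOT_DIR, 'models/fall_long/xgboost')

print(FALL_LONG_XGB_DIR)                              # /tmp/air/models/fall_long/xgboost
FALL_LONG_XGB_DIR.mkdir(parents=True, exist_ok=True)  # consumers typically create it on demand

The same result reads more idiomatically as ROOT_DIR / 'models/fall_long/xgboost'.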
src/data/make_dataset.py (deleted, 100644 → 0)

#!/usr/bin/env python
import config as cfg
from tools import file_reader, file_writer, feature_maker
from tools import preprocessor, neural_embedder
from utility import embedder
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA

CASES = ["Complete", "Success", "Fall"]
USE_ATS_NAMES = True

def main():
    cl = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'cl.csv',
                              converters={'CitizenId': str, 'Cluster': int})
    df = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'screenings.csv',
                              converters={'CitizenId': str})
    fd = file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'fd.pkl')

    longterm_fall_df = make_longterm_fall(df, fd)
    file_writer.write_csv(longterm_fall_df, cfg.PROCESSED_DATA_DIR, 'longterm_fall.csv')

    for case in CASES:
        df_full = make_dataset_full(cl, df, case)
        file_writer.write_csv(df_full, cfg.PROCESSED_DATA_DIR, f'{case.lower()}.csv')
        df_count = make_dataset_count(case)
        file_writer.write_csv(df_count, cfg.PROCESSED_DATA_DIR, f'{case.lower()}_count.csv')
        df_emb = make_dataset_emb(case)
        file_writer.write_csv(df_emb, cfg.PROCESSED_DATA_DIR, f'{case.lower()}_emb.csv')

def make_longterm_fall(df, fd):
    fd = fd.drop_duplicates(["CitizenId", "Date"])
    df = preprocessor.split_cat_columns(df, col='Ats', tag='Ats', resolution=cfg.ATS_RESOLUTION)
    df = feature_maker.make_longterm_fall_feature(df, fd)
    return df

def make_dataset_full(cl: pd.DataFrame, df: pd.DataFrame, case: str):
    df['Cluster'] = cl['Cluster']
    df = preprocessor.split_cat_columns(df, col='Ats', tag='Ats', resolution=cfg.ATS_RESOLUTION)
    if case == "Complete":
        df = feature_maker.make_complete_feature(df)
    elif case == "Success":
        df = feature_maker.make_success_feature(df)
    else:
        df = feature_maker.make_fall_feature(df)
    ats_cols = df.filter(regex='Ats', axis=1)
    general_cols = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod']]
    df = pd.concat([general_cols, ats_cols, df[[case]]], axis=1)
    if USE_ATS_NAMES:
        df = preprocessor.replace_ats_values(df)
    return df

def make_dataset_count(case: str):
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, f'{case.lower()}.csv', converters=ats)
    if USE_ATS_NAMES:
        cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]
        unique_ats = [df[f'{i}Ats'].unique() for i in range(1, cfg.ATS_RESOLUTION + 1)]
        unique_ats = list(set(np.concatenate(unique_ats)))
        df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, '')
        df = df.drop(cols_ats, axis=1)
        df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)
        df = df.drop('0', axis=1)
    else:
        num_cols = embedder.get_numerical_cols(df, case)
        cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]
        unique_ats = [df[f'{i}Ats'].unique() for i in range(1, cfg.ATS_RESOLUTION + 1)]
        unique_ats = list(set(np.concatenate(unique_ats)))
        df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
        df = pd.concat([df, df_ats], axis=1)
        ats_columns = ['Ats_' + ats for ats in unique_ats]
        df = df[num_cols + ats_columns + [case]]
        df = df.drop(['Ats_0'], axis=1)
    return df

def make_dataset_emb(case: str):
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, f'{case.lower()}.csv', converters=ats)
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]
    target_name = case
    train_ratio = 0.9
    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(
        df_to_enc, target_name, train_ratio)
    if case == "Complete":
        artifacts_path = cfg.COMPLETE_EMB_DIR
        epochs = 5
    elif case == "Success":
        artifacts_path = cfg.SUCCESS_EMB_DIR
        epochs = 5
    else:
        artifacts_path = cfg.FALL_EMB_DIR
        epochs = 20
    params = {"df": df_to_enc,
              "target_name": target_name,
              "train_ratio": train_ratio,
              "network_layers": [128],
              "epochs": epochs,
              "batch_size": 128,
              "verbose": False,
              "artifacts_path": artifacts_path}
    network = neural_embedder.NeuralEmbedder(**params)
    network.fit(X_train, y_train, X_val, y_val)
    network.save_model()
    embedded_weights = network.get_embedded_weights()
    network.save_weights(embedded_weights)
    network.save_labels(labels)
    network.make_visualizations_from_network(extension='png')

    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    embedded_df = df.iloc[:, n_numerical_cols:df.shape[1] - 1]
    for index in range(embedded_df.shape[1]):
        column = embedded_df.columns[index]
        labels_column = labels[index]
        embeddings_column = embedded_weights[index]
        pca = PCA(n_components=1)
        Y = pca.fit_transform(embeddings_column)
        y_array = np.concatenate(Y)
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
                                  f'{case.lower()}_{column}.csv')
        df[column] = df[column].replace(to_replace=mapping)
    return df

if __name__ == "__main__":
    main()
\ No newline at end of file
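
prepare_data_for_embedder is not part of this diff; judging by the later use of labels[index].classes_, it appears to label-encode each categorical column and return the fitted encoders alongside the split. A sketch of that reading, purely an assumption about the helper's behavior:

# Assumption-only sketch of what prepare_data_for_embedder appears to do
# (inferred from the labels[index].classes_ usage above; the real helper lives
# in tools/preprocessor.py and is not in this commit).
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def prepare_data_for_embedder_sketch(df: pd.DataFrame, target_name: str, train_ratio: float):
    X, y = df.drop(target_name, axis=1), df[target_name]
    labels = []
    for col in X.columns:
        encoder = LabelEncoder()
        X[col] = encoder.fit_transform(X[col])
        labels.append(encoder)  # later read back via labels[index].classes_
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_ratio)
    return X_train, X_val, y_train, y_val, labels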
src/data/make_dataset_count.py (new file, 0 → 100644)

#!/usr/bin/env python
import config as cfg
from tools import file_reader, file_writer
from tools import preprocessor
from utility import embedder
import pandas as pd
import numpy as np

USE_CAT_NAMES = True

def main():
    make_complete_count()
    make_compliance_count()
    make_fall_short_count()
    make_fall_long_count()

def make_complete_count():
    case = 'Complete'
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete.csv', converters=ats)
    cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]
    unique_ats = [df[f'{i}Ats'].unique() for i in range(1, cfg.ATS_RESOLUTION + 1)]
    unique_ats = list(set(np.concatenate(unique_ats)))
    if USE_CAT_NAMES:
        df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, '')
        df = df.drop(cols_ats, axis=1)
        df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)
        df = df.drop('0', axis=1)
    else:
        # As in the deleted make_dataset.py, this branch needs 'Ats_'-prefixed count
        # columns; the committed version built df_ats once with an empty prefix.
        num_cols = embedder.get_numerical_cols(df, case)
        df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
        df = pd.concat([df, df_ats], axis=1)
        ats_columns = ['Ats_' + ats for ats in unique_ats]
        # The committed version selected num_cols + ats_columns + df[[case]];
        # a column selection needs a list of labels, so append [case] instead.
        df = df[num_cols + ats_columns + [case]]
        df = df.drop(['Ats_0'], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_count.csv')

def make_compliance_count():
    case = 'Compliance'
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'compliance.csv', converters=ats)
    cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]
    unique_ats = [df[f'{i}Ats'].unique() for i in range(1, cfg.ATS_RESOLUTION + 1)]
    unique_ats = list(set(np.concatenate(unique_ats)))
    if USE_CAT_NAMES:
        df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, '')
        df = df.drop(cols_ats, axis=1)
        df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)
        df = df.drop('0', axis=1)
    else:
        # Same two fixes as in make_complete_count.
        num_cols = embedder.get_numerical_cols(df, case)
        df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
        df = pd.concat([df, df_ats], axis=1)
        ats_columns = ['Ats_' + ats for ats in unique_ats]
        df = df[num_cols + ats_columns + [case]]
        df = df.drop(['Ats_0'], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_count.csv')

def make_fall_short_count():
    case = 'FallShort'
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_short.csv', converters=ats)
    cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]
    unique_ats = [df[f'{i}Ats'].unique() for i in range(1, cfg.ATS_RESOLUTION + 1)]
    unique_ats = list(set(np.concatenate(unique_ats)))
    if USE_CAT_NAMES:
        df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, '')
        df = df.drop(cols_ats, axis=1)
        df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)
        df = df.drop('0', axis=1)
    else:
        # Same two fixes as in make_complete_count.
        num_cols = embedder.get_numerical_cols(df, case)
        df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
        df = pd.concat([df, df_ats], axis=1)
        ats_columns = ['Ats_' + ats for ats in unique_ats]
        df = df[num_cols + ats_columns + [case]]
        df = df.drop(['Ats_0'], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_short_count.csv')

def make_fall_long_count():
    case = 'FallLong'
    ex = {str(i) + 'Ex': str for i in range(1, cfg.EX_RESOLUTION + 1)}
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    converters = {**ex, **ats}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_long.csv', converters=converters)
    num_cols = embedder.get_numerical_cols(df, case)

    # Extract exercises
    cols_ex = [str(i) + 'Ex' for i in range(1, cfg.EX_RESOLUTION + 1)]
    unique_ex = [df[f'{i}Ex'].unique() for i in range(1, cfg.EX_RESOLUTION + 1)]
    unique_ex = list(set(np.concatenate(unique_ex)))
    df_ex = preprocessor.extract_cat_count(df, unique_ex, cols_ex, 'Ex_')

    # Extract ats
    cols_ats = [str(i) + 'Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]
    unique_ats = [df[f'{i}Ats'].unique() for i in range(1, cfg.ATS_RESOLUTION + 1)]
    unique_ats = list(set(np.concatenate(unique_ats)))
    df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')

    # Merge dataframes
    df = pd.concat([df, df_ex, df_ats], axis=1)
    ex_columns = ['Ex_' + ex for ex in unique_ex]
    ats_columns = ['Ats_' + ats for ats in unique_ats]
    df = df[num_cols + ex_columns + ats_columns + [case]]
    df = df.drop(['Ex_0', 'Ats_0'], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_long_count.csv')

if __name__ == "__main__":
    main()
\ No newline at end of file
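
All four functions apply the same recipe: gather the categories used across the positional 1Ats…50Ats (and, for the fall_long case, 1Ex…9Ex) columns, then count-encode them with preprocessor.extract_cat_count. That helper is not in this diff; the sketch below shows, on toy data, the counting idea it presumably implements:

# Hedged sketch of the count-encoding idea behind extract_cat_count (the real
# helper lives in tools/preprocessor.py and is not part of this commit): for
# each row, count how often each category appears across the positional columns.
import pandas as pd

df = pd.DataFrame({'1Ats': ['walker', 'bed', '0'],
                   '2Ats': ['walker', '0', '0']})
cols = ['1Ats', '2Ats']
categories = pd.unique(df[cols].values.ravel())

counts = pd.DataFrame({'Ats_' + c: df[cols].eq(c).sum(axis=1) for c in categories})
print(counts)  # one count column per category; the 'Ats_0' padding column is dropped later

Since the four functions differ only in the case name, input file, and converters, they could be folded into one parameterized function, much like make_dataset_count in the deleted make_dataset.py.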
src/data/make_dataset_emb.py (new file, 0 → 100644)

#!/usr/bin/env python
import config as cfg
from tools import file_reader, file_writer, feature_maker
from tools import preprocessor, neural_embedder
from utility import embedder
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA

def main():
    make_complete_emb()
    make_compliance_emb()
    make_fall_short_emb()
    make_fall_long_emb()

def make_complete_emb():
    case = 'Complete'
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete.csv', converters=ats)
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]
    target_name = case
    train_ratio = 0.9
    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(
        df_to_enc, target_name, train_ratio)
    artifacts_path = cfg.COMPLETE_EMB_DIR
    epochs = 5
    params = {"df": df_to_enc,
              "target_name": target_name,
              "train_ratio": train_ratio,
              "network_layers": [128],
              "epochs": epochs,
              "batch_size": 128,
              "verbose": False,
              "artifacts_path": artifacts_path}
    network = neural_embedder.NeuralEmbedder(**params)
    network.fit(X_train, y_train, X_val, y_val)
    network.save_model()
    embedded_weights = network.get_embedded_weights()
    network.save_weights(embedded_weights)
    network.save_labels(labels)
    network.make_visualizations_from_network(extension='png')

    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    embedded_df = df.iloc[:, n_numerical_cols:df.shape[1] - 1]
    for index in range(embedded_df.shape[1]):
        column = embedded_df.columns[index]
        labels_column = labels[index]
        embeddings_column = embedded_weights[index]
        pca = PCA(n_components=1)
        Y = pca.fit_transform(embeddings_column)
        y_array = np.concatenate(Y)
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
                                  f'complete_{column}.csv')
        df[column] = df[column].replace(to_replace=mapping)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')

def make_compliance_emb():
    case = 'Compliance'
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'compliance.csv', converters=ats)
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]
    target_name = case
    train_ratio = 0.9
    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(
        df_to_enc, target_name, train_ratio)
    artifacts_path = cfg.COMPLIANCE_EMB_DIR
    epochs = 5
    params = {"df": df_to_enc,
              "target_name": target_name,
              "train_ratio": train_ratio,
              "network_layers": [128],
              "epochs": epochs,
              "batch_size": 128,
              "verbose": False,
              "artifacts_path": artifacts_path}
    network = neural_embedder.NeuralEmbedder(**params)
    network.fit(X_train, y_train, X_val, y_val)
    network.save_model()
    embedded_weights = network.get_embedded_weights()
    network.save_weights(embedded_weights)
    network.save_labels(labels)
    network.make_visualizations_from_network(extension='png')

    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    embedded_df = df.iloc[:, n_numerical_cols:df.shape[1] - 1]
    for index in range(embedded_df.shape[1]):
        column = embedded_df.columns[index]
        labels_column = labels[index]
        embeddings_column = embedded_weights[index]
        pca = PCA(n_components=1)
        Y = pca.fit_transform(embeddings_column)
        y_array = np.concatenate(Y)
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
                                  f'compliance_{column}.csv')
        df[column] = df[column].replace(to_replace=mapping)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')

def make_fall_short_emb():
    case = 'FallShort'
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_short.csv', converters=ats)
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]
    target_name = case
    train_ratio = 0.9
    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(
        df_to_enc, target_name, train_ratio)
    artifacts_path = cfg.FALL_SHORT_EMB_DIR
    epochs = 5
    params = {"df": df_to_enc,
              "target_name": target_name,
              "train_ratio": train_ratio,
              "network_layers": [128],
              "epochs": epochs,
              "batch_size": 128,
              "verbose": False,
              "artifacts_path": artifacts_path}
    network = neural_embedder.NeuralEmbedder(**params)
    network.fit(X_train, y_train, X_val, y_val)
    network.save_model()
    embedded_weights = network.get_embedded_weights()
    network.save_weights(embedded_weights)
    network.save_labels(labels)
    network.make_visualizations_from_network(extension='png')

    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    embedded_df = df.iloc[:, n_numerical_cols:df.shape[1] - 1]
    for index in range(embedded_df.shape[1]):
        column = embedded_df.columns[index]
        labels_column = labels[index]
        embeddings_column = embedded_weights[index]
        pca = PCA(n_components=1)
        Y = pca.fit_transform(embeddings_column)
        y_array = np.concatenate(Y)
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
                                  f'fall_short_{column}.csv')
        df[column] = df[column].replace(to_replace=mapping)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_short_emb.csv')

def make_fall_long_emb():
    case = 'FallLong'
    ex = {str(i) + 'Ex': str for i in range(1, cfg.EX_RESOLUTION + 1)}
    ats = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}
    converters = {**ex, **ats}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_long.csv', converters=converters)
    emb_cols = df.filter(regex=r'((\d+)[Ats|Ex])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]
    target_name = case
    train_ratio = 0.9
    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(
        df_to_enc, target_name, train_ratio)
    artifacts_path = cfg.FALL_LONG_EMB_DIR
    epochs = 5
    params = {"df": df_to_enc,
              "target_name": target_name,
              "train_ratio": train_ratio,
              "network_layers": [128],
              "epochs": epochs,
              "batch_size": 128,
              "verbose": False,
              "artifacts_path": artifacts_path}
    network = neural_embedder.NeuralEmbedder(**params)
    network.fit(X_train, y_train, X_val, y_val)
    network.save_model()
    embedded_weights = network.get_embedded_weights()
    network.save_weights(embedded_weights)
    network.save_labels(labels)
    network.make_visualizations_from_network(extension='png')

    emb_cols = df.filter(regex=r'((\d+)[Ats|Ex])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    embedded_df = df.iloc[:, n_numerical_cols:df.shape[1] - 1]
    for index in range(embedded_df.shape[1]):
        column = embedded_df.columns[index]
        labels_column = labels[index]
        embeddings_column = embedded_weights[index]
        pca = PCA(n_components=1)
        Y = pca.fit_transform(embeddings_column)
        y_array = np.concatenate(Y)
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
                                  f'fall_long_{column}.csv')
        df[column] = df[column].replace(to_replace=mapping)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_long_emb.csv')