Christian Fischer Pedersen / AIR · Commits

Commit 3a92b513
authored Aug 02, 2021 by thecml
added adasyn test, decision tree viz
parent da6e77d2
Pipeline #64875 passed in 3 minutes and 4 seconds
Changes 16 · Pipelines 1
configs/complete_emb.yaml
@@ -11,7 +11,5 @@ train_ratio: 0.8
 batch_size: 32
 num_epochs: 5
 verbose: True
 # Network Hyperparams --------------------------------------
-network_layers: [128]
\ No newline at end of file
+network_layers: [128]
+optimizer: "Adam"
\ No newline at end of file
configs/compliance_emb.yaml
@@ -11,7 +11,5 @@ train_ratio: 0.8
 batch_size: 32
 num_epochs: 20
 verbose: True
 # Network Hyperparams --------------------------------------
-network_layers: [128]
\ No newline at end of file
+network_layers: [128]
+optimizer: "Adam"
\ No newline at end of file
configs/fall_emb.yaml
@@ -11,7 +11,5 @@ train_ratio: 0.8
 batch_size: 32
 num_epochs: 10
 verbose: True
 # Network Hyperparams --------------------------------------
-network_layers: [128]
\ No newline at end of file
+network_layers: [128]
+optimizer: "Adam"
\ No newline at end of file
configs/settings.yaml
@@ -13,4 +13,4 @@ fall_exercises: ['8058','8062','8066','8077','8074','8059','8071','8067']
 # Settings for dataset -------------------------------------------------
 #
-use_real_ats_names: False
+use_real_ats_names: True
notebooks/MLP_shap.ipynb
(source diff too large to display; view the blob instead)
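The notebook's contents are not rendered in this diff. As a rough, hypothetical sketch of what a SHAP analysis of an MLP classifier typically looks like (model, data and feature sets below are illustrative, not taken from the notebook):

# Hypothetical sketch only; the actual notebook may use a different model and data.
import shap
from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
mlp = MLPClassifier(hidden_layer_sizes=(32,), max_iter=500, random_state=0).fit(X, y)

# KernelExplainer is model-agnostic; a small background set keeps it tractable.
explainer = shap.KernelExplainer(lambda d: mlp.predict_proba(d)[:, 1], X[:50])
shap_values = explainer.shap_values(X[:20])
shap.summary_plot(shap_values, X[:20])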
notebooks/RF_DecisionTree.ipynb (new file, 0 → 100644)
(diff collapsed in this view)
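The collapsed notebook presumably holds the decision-tree visualization mentioned in the commit message. A minimal sketch of how a single tree of a fitted random forest can be drawn with scikit-learn (toy data and labels, purely illustrative):

# Illustrative only; the notebook's actual data and model settings are not shown in this diff.
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree

X, y = make_classification(n_samples=500, n_features=8, random_state=0)
rf = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0).fit(X, y)

# Draw the first tree of the ensemble; a small max_depth keeps the plot readable.
plt.figure(figsize=(12, 6))
plot_tree(rf.estimators_[0], filled=True,
          feature_names=[f"f{i}" for i in range(8)], class_names=["0", "1"])
plt.show()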
src/analysis/find_best_shap_features.py
@@ -26,7 +26,6 @@ def main():
     dl = data_loader.FallDataLoader(FALL_FILENAME).load_data()
     X, y = dl.get_data()
     #X['Random'] = np.random.rand(len(X),1) # add random noise col
     cols = X.columns
     X = np.array(X)
     y = np.array(y)
src/analysis/tune_rf_grid_search.py (new file, 0 → 100644)

import numpy as np
import pandas as pd
import paths as pt
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from tools import data_loader, file_writer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

FILENAME = "fall_emb.csv"

def main():
    dl = data_loader.FallDataLoader(FILENAME).load_data()
    features = dl.get_features()
    X_train, X_test, y_train, y_test = dl.prepare_data_split(test_size=0.7,
                                                             scaling_strategy="Standard")
    param_grid = {
        'n_estimators': [200, 400, 600, 800, 1000],
        'max_features': ['auto'],
        'max_depth': [3],
        'min_samples_split': [2],
        'min_samples_leaf': [3],
        'criterion': ['gini']
    }
    model = RandomForestClassifier(random_state=0, class_weight="balanced")
    cv_rfc = GridSearchCV(estimator=model, param_grid=param_grid,
                          cv=5, verbose=2, n_jobs=-1, scoring='roc_auc')
    cv_rfc.fit(X_train, y_train)
    print("Best: %f using %s" % (cv_rfc.best_score_, cv_rfc.best_params_))
    rf = cv_rfc.best_estimator_
    print(f"\nACC_TEST: {accuracy_score(y_test, rf.predict(X_test))}")
    print(f"ROC_AUC_TEST: {roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])}")

if __name__ == "__main__":
    main()
\ No newline at end of file
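One note on the evaluation prints above: roc_auc_score needs a score for the positive class, which is why the script passes predict_proba(X_test)[:, 1] rather than hard predictions. A tiny standalone illustration (toy data, not the project's):

# Toy illustration of ROC AUC scored on positive-class probabilities.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, weights=[0.8, 0.2], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)

proba_pos = clf.predict_proba(X_te)[:, 1]   # column 1 = probability of class 1
print(roc_auc_score(y_te, proba_pos))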
src/analysis/tune_rf_grid_search_imbl.py (new file, 0 → 100644)

import numpy as np
import pandas as pd
import paths as pt
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from tools import data_loader, file_writer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import ADASYN

FILENAME = "fall_emb.csv"

def main():
    dl = data_loader.FallDataLoader(FILENAME).load_data()
    features = dl.get_features()
    X_train, X_test, y_train, y_test = dl.prepare_data_split(test_size=0.7,
                                                             scaling_strategy="Standard")
    param_grid = {
        'model__n_estimators': [200, 400, 600, 800, 1000],
        'model__max_features': ['auto'],
        'model__max_depth': [3],
        'model__min_samples_split': [2],
        'model__min_samples_leaf': [3],
        'model__criterion': ['gini']
    }
    rf = RandomForestClassifier(random_state=0)
    adasyn = ADASYN(random_state=0)
    pipeline = Pipeline([('sampling', adasyn), ('model', rf)])
    cv_rfc = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                          cv=5, verbose=2, n_jobs=-1, scoring='roc_auc')
    cv_rfc.fit(X_train, y_train)
    print("Best: %f using %s" % (cv_rfc.best_score_, cv_rfc.best_params_))
    rf = cv_rfc.best_estimator_
    print(f"\nACC_TEST: {accuracy_score(y_test, rf.predict(X_test))}")
    print(f"ROC_AUC_TEST: {roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])}")

if __name__ == "__main__":
    main()
\ No newline at end of file
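Putting ADASYN inside an imblearn Pipeline, as this new script does, means the resampling is re-fit on the training folds only during GridSearchCV; the validation folds are scored on the original, untouched data. A minimal self-contained sketch of the pattern (toy data, parameter grid chosen only for illustration):

# Minimal sketch: ADASYN oversampling inside an imblearn Pipeline under cross-validated search.
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=600, weights=[0.9, 0.1], random_state=0)

pipe = Pipeline([('sampling', ADASYN(random_state=0)),
                 ('model', RandomForestClassifier(random_state=0))])

# Step parameters are addressed as '<step name>__<parameter>', hence the 'model__' prefix.
grid = GridSearchCV(pipe, {'model__n_estimators': [100, 200]},
                    cv=5, scoring='roc_auc', n_jobs=-1)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)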
src/analysis/tune_xgb_random_search.py (deleted, 100644 → 0)

import numpy as np
import pandas as pd
import paths as pt
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from tools import data_loader, file_writer
from sklearn.model_selection import StratifiedKFold

CASE = "Complete"
COMPLETE_FILENAME = "complete_count.csv"
FALL_FILENAME = "fall_count.csv"
SCALING_STRATEGY = "Standard"

def main():
    if CASE == "Complete":
        X, y = data_loader.CompleteDataLoader(COMPLETE_FILENAME) \
            .load_data().prepare_data(SCALING_STRATEGY)
    else:
        X, y = data_loader.FallDataLoader(FALL_FILENAME) \
            .load_data().prepare_data(SCALING_STRATEGY)

    params = {
        'min_child_weight': [1, 5, 10, 20],
        'gamma': [0.1, 0.2, 0.5, 1, 1.5, 2, 5, 10],
        'subsample': [0.2, 0.4, 0.6, 0.8, 1.0],
        'colsample_bytree': [0.2, 0.4, 0.6, 0.8, 1.0],
        'max_depth': [1, 2, 3, 4, 5, 10, 20, 50]
    }

    neg, pos = np.bincount(y)
    scale_pos_weight = neg / pos

    xgb = XGBClassifier(learning_rate=0.1,
                        n_estimators=400,
                        scale_pos_weight=scale_pos_weight,
                        objective='binary:logistic',
                        eval_metric='logloss',
                        use_label_encoder=False,
                        seed=0)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    random_search = RandomizedSearchCV(xgb, param_distributions=params,
                                       n_iter=10, scoring='neg_log_loss',
                                       n_jobs=-1, cv=skf, verbose=3,
                                       random_state=0)
    random_search.fit(X, y)

    print('\nAll results:')
    print(random_search.cv_results_)
    print('\nBest estimator:')
    print(random_search.best_estimator_)
    print('\nBest normalized gini score for %d-fold search with %d parameter combinations:' % (5, 5))
    print(random_search.best_score_ * 2 - 1)
    print('\nBest hyperparameters:')
    print(random_search.best_params_)

    results = pd.DataFrame(random_search.cv_results_)
    file_writer.write_csv(results, pt.REPORTS_DIR, 'xgb-random-grid-search-results.csv')

if __name__ == "__main__":
    main()
\ No newline at end of file
src/data/make_dataset_count.py
@@ -28,6 +28,8 @@ def make_complete_count():
     unique_ats = list(set(np.concatenate(unique_ats)))
     df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
     df_ats = df_ats.drop(['Ats_0'], axis=1)
     df = df.drop(cols_ats, axis=1)
     df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)

@@ -46,6 +48,8 @@ def make_compliance_count():
     unique_ats = list(set(np.concatenate(unique_ats)))
     df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
     df_ats = df_ats.drop(['Ats_0'], axis=1)
     df = df.drop(cols_ats, axis=1)
     df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)

@@ -65,6 +69,8 @@ def make_fall_count():
     unique_ats = list(set(np.concatenate(unique_ats)))
     df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
     df_ats = df_ats.drop(['Ats_0'], axis=1)
     df = df.drop(cols_ats, axis=1)
     df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)
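Each hunk revolves around preprocessor.extract_cat_count, which, judging by the 'Ats_' prefix and the later drop of 'Ats_0', appears to build one count column per unique assistive-technology (Ats) code. The project's implementation is not part of this diff; a rough sketch of the general technique in plain pandas (column names and codes are illustrative):

# Rough sketch of per-category count columns; the project's extract_cat_count may differ.
import pandas as pd

df = pd.DataFrame({'1Ats': ['A', 'B', '0'],
                   '2Ats': ['A', '0', 'C']})
cols_ats = ['1Ats', '2Ats']
unique_ats = ['A', 'B', 'C', '0']

# Count how often each Ats code occurs across the Ats columns of every row.
counts = pd.DataFrame({f'Ats_{code}': df[cols_ats].eq(code).sum(axis=1)
                       for code in unique_ats})
print(counts)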
src/data/make_dataset_emb.py
@@ -44,13 +44,14 @@ def main(ats_resolution: int = None):
     # Encode dataframe given params
     model_path = Path.joinpath(pt.ROOT_DIR, emb_cfg['model_path'])
     df_enc = encode_dataframe(df=df_to_enc,
-                              target_name=emb_cfg['target_name'],
-                              batch_size=emb_cfg['batch_size'],
-                              train_ratio=emb_cfg['train_ratio'],
-                              epochs=emb_cfg['num_epochs'],
-                              network_layers=emb_cfg['network_layers'],
-                              verbose=emb_cfg['verbose'],
-                              model_path=model_path)
+                              target_name=emb_cfg['target_name'],
+                              batch_size=emb_cfg['batch_size'],
+                              train_ratio=emb_cfg['train_ratio'],
+                              epochs=emb_cfg['num_epochs'],
+                              optimizer=emb_cfg['optimizer'],
+                              network_layers=emb_cfg['network_layers'],
+                              verbose=emb_cfg['verbose'],
+                              model_path=model_path)
     df_rand = pd.DataFrame(np.random.rand(len(df),1), columns=['Rand'])
     df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, df.pop(target_name)], axis=1)

@@ -81,14 +82,14 @@ def make_fall_test_emb(ats_resolution):
     df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, df.pop('Fall')], axis=1)
     file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, 'fall_test_emb.csv')

-def encode_dataframe(df, target_name, batch_size, train_ratio, epochs,
-                     network_layers, verbose, model_path):
+def encode_dataframe(df, target_name, batch_size, train_ratio, epochs, optimizer,
+                     network_layers, verbose, model_path):
     X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(df, target_name, train_ratio)
     network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
                                              batch_size=batch_size, network_layers=network_layers,
-                                             verbose=verbose, model_path=model_path)
+                                             optimizer_fn=optimizer, verbose=verbose, model_path=model_path)
     network.fit(X_train, y_train, X_val, y_val)
     network.save_model()
     embedded_weights = network.get_embedded_weights()

@@ -114,14 +115,16 @@ def encode_dataframe(df, target_name, batch_size, train_ratio,
     return df_to_enc

-def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
-    params = get_config(df_to_enc, target_name, artifacts_path)
-    X, y = preprocessor.get_X_y(df_to_enc, target_name)
+def encode_dataframe_cv(df, target_name, batch_size, train_ratio, epochs,
+                        network_layers, verbose, model_path):
+    X, y = preprocessor.get_X_y(df, target_name)
     X, labels = preprocessor.encode_vector_label(X)
     y = np.array(y)
-    network = neural_embedder.NeuralEmbedder(**params)
+    network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
+                                             batch_size=batch_size, network_layers=network_layers,
+                                             verbose=verbose, model_path=model_path)
     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
     es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min',

@@ -150,7 +153,7 @@ def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
     if ENABLE_EMB_VIZ:
         network.make_visualizations_from_network(extension='png')

-    df_to_enc = df_to_enc.drop(target_name, axis=1)
+    df_to_enc = df.drop(target_name, axis=1)

     for index in range(df_to_enc.shape[1]):
         column = df_to_enc.columns[index]
         labels_column = labels[index]
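The new optimizer argument threaded through encode_dataframe comes from the optimizer: "Adam" key added to the configs above and ends up as optimizer_fn on NeuralEmbedder. NeuralEmbedder's internals are not part of this diff, but the embedder appears to be Keras-based (tf.keras callbacks show up a few lines further down), and Keras accepts an optimizer identifier string directly. A hypothetical sketch of that mechanism:

# Hypothetical sketch: Keras resolves an optimizer name string such as "Adam"
# via tf.keras.optimizers.get(); layer sizes here are illustrative.
import tensorflow as tf

def build_model(optimizer_fn: str = "Adam"):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer=optimizer_fn, loss="binary_crossentropy", metrics=["accuracy"])
    return model

model = build_model("Adam")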
src/data/make_screenings.py
@@ -138,7 +138,7 @@ def get_screenings_by_id(data, id, settings):
     single_screening['Ats'] = feature_maker.get_ats(citizen_data.ats, end_date, settings)
     single_screening['NumberAts'] = feature_maker.get_number_ats(citizen_data.ats, end_date)
-    single_screening['LoanPeriod'] = feature_maker.get_loan_period(citizen_data.ats, end_date)
+    single_screening['LoanPeriod'] = feature_maker.get_avg_loan_period(citizen_data.ats, end_date)
     single_screening['Needs'] = screening.NeedForHelpScore
     single_screening['NeedsReason'] = screening.NeedForHelpReason
src/model/train_xgb_models.py
@@ -11,7 +11,7 @@ from sklearn.model_selection import train_test_split
 import xgboost as xgb

 DATA_DIR = pt.PROCESSED_DATA_DIR
-CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
+CASES = ["Complete", "Compliance", "Fall"]

 def main(dataset_version : str = 'emb'):
     for case in CASES:

@@ -47,7 +47,7 @@ def main(dataset_version : str = 'emb'):
     df = df.sample(frac=1, random_state=0).reset_index(drop=True)
     #df['Random'] = np.random.rand(len(df),1) # add random noise col
     X = df.drop([target_name], axis=1)
     y = df[target_name]
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
src/tools/data_loader.py
@@ -6,8 +6,8 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.model_selection import train_test_split

 class BaseDataLoader(ABC):
-    def __init__(self, file_name=None, converters=None):
-        """Initilizer method that takes a file name"""
+    def __init__(self, file_name, converters=None):
+        """Initilizer method that takes a file name
+        and optionally a converter
+        """
         self.file_name = file_name
         self.converters = converters

@@ -15,62 +15,50 @@ class BaseDataLoader(ABC):
     def load_data(self):
         """Loads the data from a data set at startup"""

-    @abstractmethod
-    def prepare_data(self, scaling_strategy=None):
-        """Prepares the data from a data set"""
-
-    @abstractmethod
-    def prepare_data_split(self, scaling_strategy=None, test_size=None):
-        """Prepares and splits the data from a data set"""
-
-class ComplianceDataLoader(BaseDataLoader):
-    def load_data(self):
-        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, self.file_name, converters=self.converters)
-        X = df.drop(['Compliance'], axis=1)
-        y = df['Compliance']
-        self.X = X
-        self.y = y
-        return self
-
     def get_data(self):
         """Returns the features and target"""
         return self.X, self.y

     def get_features(self):
         """Returns the feature names"""
         return self.X.columns

-    def prepare_data(self, scaling_strategy: str = "Standard"):
+    def prepare_data(self, scaling_strategy=None):
+        """Prepares the data from a data set"""
         X = np.array(self.X)
         y = np.array(self.y)
-        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
-        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
-        if scaling_strategy == "Standard":
-            scaler = StandardScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        else:
-            scaler = MinMaxScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
+        if scaling_strategy != None:
+            emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
+            n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
+            if scaling_strategy == "Standard":
+                scaler = StandardScaler()
+                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
+                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
+            else:
+                scaler = MinMaxScaler()
+                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
+                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
         return X, y

-    def prepare_data_split(self, scaling_strategy: str = "Standard", test_size: float = 0.3):
+    def prepare_data_split(self, test_size, scaling_strategy=None):
+        """Prepares and splits the data from a data set"""
         X = np.array(self.X)
         y = np.array(self.y)
-        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
-        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
-        if scaling_strategy == "Standard":
-            scaler = StandardScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        else:
-            scaler = MinMaxScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
+        if scaling_strategy != None:
+            emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
+            n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
+            if scaling_strategy == "Standard":
+                scaler = StandardScaler()
+                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
+                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
+            else:
+                scaler = MinMaxScaler()
+                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
+                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                             stratify=y, random_state=0)

@@ -87,49 +75,17 @@ class CompleteDataLoader(BaseDataLoader):
         self.X = X
         self.y = y
         return self

-    def get_data(self):
-        return self.X, self.y
-
-    def get_features(self):
-        return self.X.columns
-
-    def prepare_data(self, scaling_strategy: str = "Standard"):
-        X = np.array(self.X)
-        y = np.array(self.y)
-        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
-        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
-        if scaling_strategy == "Standard":
-            scaler = StandardScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        else:
-            scaler = MinMaxScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        return X, y
-
-    def prepare_data_split(self, scaling_strategy: str = "Standard", test_size: float = 0.3):
-        X = np.array(self.X)
-        y = np.array(self.y)
-        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
-        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
-        if scaling_strategy == "Standard":
-            scaler = StandardScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        else:
-            scaler = MinMaxScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
-                                                            stratify=y, random_state=0)
-        return X_train, X_test, y_train, y_test
-
+class ComplianceDataLoader(BaseDataLoader):
+    def load_data(self):
+        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, self.file_name, converters=self.converters)
+        X = df.drop(['Compliance'], axis=1)
+        y = df['Compliance']
+        self.X = X
+        self.y = y
+        return self

 class FallDataLoader(BaseDataLoader):
     def load_data(self):

@@ -141,49 +97,6 @@ class FallDataLoader(BaseDataLoader):
         self.X = X
         self.y = y
         return self

-    def get_data(self):
-        return self.X, self.y
-
-    def get_features(self):
-        return self.X.columns
-
-    def prepare_data(self, scaling_strategy: str = "Standard"):
-        X = np.array(self.X)
-        y = np.array(self.y)
-        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
-        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
-        if scaling_strategy == "Standard":
-            scaler = StandardScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        else:
-            scaler = MinMaxScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        return X, y
-
-    def prepare_data_split(self, scaling_strategy: str, test_size: float):
-        X = np.array(self.X)
-        y = np.array(self.y)
-        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
-        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
-        if scaling_strategy == "Standard":
-            scaler = StandardScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        else:
-            scaler = MinMaxScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
-                                                            stratify=y, random_state=0)
-        return X_train, X_test, y_train, y_test

 class FallTestDataLoader(BaseDataLoader):
     def load_data(self):

@@ -196,40 +109,40 @@ class FallTestDataLoader(BaseDataLoader):
         self.y = y
         return self

     def get_data(self):
         return self.X, self.y

-    def prepare_data(self, scaling_strategy: str = "Standard"):
+    def prepare_data(self, scaling_strategy: str = None):
         X = np.array(self.X)
         y = np.array(self.y)
-        emb_cols = self.X.filter(regex='((\d+)[Ats|Ex])\w+', axis=1)
-        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
-        if scaling_strategy == "Standard":
-            scaler = StandardScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
-        else:
-            scaler = MinMaxScaler()
-            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
-            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
+        if scaling_strategy != None:
+            emb_cols = self.X.filter(regex='((\d+)[Ats|Ex])\w+', axis=1)
+            n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
+            if scaling_strategy == "Standard":
+                scaler = StandardScaler()
+                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
+                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
+            else:
+                scaler = MinMaxScaler()
+                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
+                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
         return X
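With this refactor the shared get_data, get_features, prepare_data and prepare_data_split implementations live on BaseDataLoader, and the concrete loaders mostly just override load_data (FallTestDataLoader keeps its own prepare_data for the extra exercise columns). A short usage sketch matching how the new tuning scripts above call the API (file name taken from those scripts):

# Usage sketch of the refactored loader API, mirroring tune_rf_grid_search.py.
from tools import data_loader

dl = data_loader.FallDataLoader("fall_emb.csv").load_data()
X, y = dl.get_data()   # raw features and target
X_train, X_test, y_train, y_test = dl.prepare_data_split(test_size=0.7,
                                                         scaling_strategy="Standard")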