Christian Fischer Pedersen / AIR

Commit 94e72200, authored Dec 16, 2021 by thecml
Parent: e1eb738f

    added scripts for hyperparams

Pipeline #99774 failed after 58 seconds. 24 files changed.
.gitignore

 .vscode/settings.json
+wandb
\ No newline at end of file
ml/configs/alarm.yaml

@@ -2,4 +2,21 @@
 #
 features_to_normalize: ['BirthYear', 'LoanPeriod', 'NumberAts']
-features_to_scale: ['Gender', 'BirthYear', 'LoanPeriod', 'NumberAts']
\ No newline at end of file
+features_to_scale: ['Gender', 'BirthYear', 'LoanPeriod', 'NumberAts']
+
+# Dataset Stuff -------------------------------------------------
+#
+target_name: "Alarm"
+model_path: models/alarm/embeddings
+use_real_ats_names: False
+
+# Embedding Hyperparams --------------------------------------
+train_ratio: 0.8
+batch_size: 32
+num_epochs: 5
+verbose: True
+network_layers: [128]
+metrics: ['accuracy']
+optimizer: "Adam"
ml/models/complete_rf.joblib (binary, no preview for this file type)
ml/models/compliance_rf.joblib (binary, no preview for this file type)
ml/models/fall_rf.joblib (binary, no preview for this file type)
ml/models/risk_rf.joblib (binary, no preview for this file type)
ml/requirements.txt

@@ -19,4 +19,5 @@ fastapi-jwt-auth==0.5.0
 uvicorn==0.13.4
 PyYAML==5.4.1
 imbalanced-learn==0.8.0
-scikit-survival==0.16.0
\ No newline at end of file
+scikit-survival==0.16.0
+wandb==0.12.7
\ No newline at end of file
ml/src/analysis/test_model_baseline.py

@@ -69,6 +69,25 @@ def load_data_count(case, settings):
     X, y = dl.get_data()
     return X, y
 
+def load_data_ordinal(case, settings):
+    if case == "Complete":
+        dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
+                                            "complete_ordinal.csv", settings).load_data()
+        X, y = dl.get_data()
+    elif case == "Compliance":
+        dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
+                                              "compliance_ordinal.csv", settings).load_data()
+        X, y = dl.get_data()
+    elif case == "Fall":
+        dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "fall_ordinal.csv", settings).load_data()
+        X, y = dl.get_data()
+    else:
+        dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
+                                        "risk_ordinal.csv", settings).load_data()
+        X, y = dl.get_data()
+    return X, y
+
 def load_data_ohe(case, settings):
     if case == "Complete":
         dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,

@@ -104,12 +123,14 @@ def main():
                    encoding='UTF8', newline='') as f:
         writer = csv.writer(f)
         writer.writerow(header)
-        versions = ['Embedded', 'Counts', 'OneHot']
+        versions = ['Embedded', 'Counts', 'OneHot', 'Ordinal']
         for version in versions:
             if version == "Embedded":
                 X, y = load_data_embedded(case, target_settings)
             elif version == "Counts":
                 X, y = load_data_count(case, target_settings)
+            elif version == "Ordinal":
+                X, y = load_data_ordinal(case, target_settings)
             else:
                 X, y = load_data_ohe(case, target_settings)
ml/src/analysis/test_model_survival.py

 #!/usr/bin/env python
 import paths as pt
-from tools import data_loader
-from sklearn.preprocessing import LabelEncoder
+from tools import data_loader, file_reader
 from utility.settings import load_settings
 from sksurv.ensemble import RandomSurvivalForest
 from sklearn.model_selection import KFold
 from sksurv.metrics import concordance_index_censored
+from io import StringIO
+from pathlib import Path
+import shutil
 
 def main():
     data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")

@@ -17,16 +19,15 @@ def main():
                                          target_settings).load_data()
     X, y = dl.get_data()
-    labels_enc = dict()
-    ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
-    for col_name in ats_cols:
-        le = LabelEncoder()
-        le.fit(X.loc[:, col_name].astype(str))
-        labels_enc[col_name] = le
-        X.loc[:, col_name] = le.transform(X.loc[:, col_name].astype(str))
     X = X[:10000]
     y = y[:10000]
+    ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
+    infile = StringIO()
+    file_path = pt.PROCESSED_DATA_DIR
+    file_name = "alarm_emb.csv"
+    with open(Path.joinpath(file_path, file_name), 'r') as fd:
+        shutil.copyfileobj(fd, infile)
+    infile.seek(0)
+    emb_file = file_reader.read_csv(infile, converters=ats)
+    X = emb_file
     model = RandomSurvivalForest(n_estimators=200,
                                  max_depth=3,
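The read-into-StringIO pattern used here recurs in several scripts in this commit. A small self-contained sketch of the idea, using pandas directly and a hypothetical file path in place of the project's file_reader helper:

import shutil
from io import StringIO
import pandas as pd

# Copy the CSV into an in-memory buffer, rewind it, then parse it,
# forcing the 1Ats column to be read as strings.
infile = StringIO()
with open("example.csv", "r") as fd:   # hypothetical path
    shutil.copyfileobj(fd, infile)
infile.seek(0)
df = pd.read_csv(infile, converters={"1Ats": str})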
ml/src/data/make_dataset_emb.py

 #!/usr/bin/env python
-from tools import file_reader, file_writer
 from typing import BinaryIO
+from tools import file_reader, file_writer, data_loader
 from tools import preprocessor, neural_embedder
 from utility.settings import load_settings
 import pandas as pd

@@ -15,13 +16,12 @@ USE_GROUPING = False
 ENABLE_EMB_VIZ = False
 
 def main(ats_resolution: int = None):
-    for label_name in ["Complete", "Compliance", "Fall", "Risk"]:
+    for label_name in ["Complete", "Compliance", "Fall", "Risk", "Alarm"]:
         data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
         target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
         if ats_resolution == None:
             ats_resolution = data_settings['ats_resolution']
         if label_name == "Risk":
             ex_resolution = target_settings['ex_resolution']

@@ -34,6 +34,20 @@ def main(ats_resolution: int = None):
             shutil.copyfileobj(fd, infile)
             infile.seek(0)
             df = file_reader.read_csv(infile, converters=ats)
+        elif label_name == "Alarm":
+            ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
+            num_cols = ["BirthYear", "Gender", "LoanPeriod", "NumberAts"]
+            dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR, "alarm_data.pkl",
+                                             target_settings).load_data()
+            X, y = dl.get_data()
+            data = np.column_stack((X[num_cols], X[ats_cols], y["Status"]))
+            df = pd.DataFrame(data, columns=num_cols+ats_cols+["Alarm"])
+            for col in num_cols:
+                df[col] = df[col].astype(int)
+            for col in ats_cols:
+                df[col] = df[col].astype(str)
+            df["Alarm"] = df["Alarm"].astype(int)
         else:
             ex = {str(i)+'Ex': str for i in range(1, ex_resolution+1)}
             ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}

@@ -46,7 +60,7 @@ def main(ats_resolution: int = None):
             infile.seek(0)
             df = file_reader.read_csv(infile, converters=converters)
-        if label_name in ["Complete", "Compliance", "Fall"]:
+        if label_name in ["Complete", "Compliance", "Fall", "Alarm"]:
             emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
             n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
             df_to_enc = df.iloc[:, n_numerical_cols:]

@@ -60,7 +74,7 @@ def main(ats_resolution: int = None):
             df = df.drop(ats_cols + ex_cols, axis=1)
         model_path = Path.joinpath(pt.ROOT_DIR, target_settings['model_path'])
-        if label_name in ["Complete", "Compliance", "Fall"]:
+        if label_name in ["Complete", "Compliance", "Fall", "Alarm"]:
             df_enc = encode_dataframe(df=df_to_enc,
                                       target_name=target_settings['target_name'],
                                       metrics=target_settings['metrics'],

@@ -96,6 +110,8 @@ def main(ats_resolution: int = None):
         if label_name in ["Complete", "Compliance", "Fall"]:
             df = pd.concat([df.drop(label_name, axis=1), df_enc, df.pop(label_name)], axis=1)
+        elif label_name == "Alarm":
+            df = pd.concat([df.drop(label_name, axis=1), df_enc], axis=1)
         else:
             df = pd.concat([df.drop(label_name, axis=1), ats_enc, ex_enc, df.pop(label_name)], axis=1)

@@ -113,7 +129,8 @@ def encode_dataframe(df, target_name, metrics, batch_size, train_ratio, epochs,
     X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_emb(df, target_name, train_ratio)
-    network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name,
+    network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name,
+                                             metrics=metrics,
                                              epochs=epochs, batch_size=batch_size,
                                              network_layers=network_layers,
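The embedding columns are selected with the regex filter shown in the hunk above. A toy sketch of what that selection keeps, with made-up column names:

import pandas as pd

df = pd.DataFrame(columns=["BirthYear", "Gender", "LoanPeriod", "NumberAts",
                           "1Ats", "2Ats", "3Ats", "Alarm"])

# filter(regex=...) keeps only the columns whose names match the pattern:
# one or more digits followed by an 'Ats'-style suffix.
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
print(list(emb_cols.columns))  # ['1Ats', '2Ats', '3Ats']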
ml/src/data/make_dataset_ordinal.py (new file, mode 0 → 100644)

#!/usr/bin/env python
import paths as pt
from tools import file_reader, file_writer
from tools import preprocessor
from utility import embedder
import pandas as pd
import numpy as np
from pathlib import Path
from utility.settings import load_settings
from io import StringIO
import shutil
from sklearn.preprocessing import OrdinalEncoder

def main():
    for label_name in ["Complete", "Compliance", "Fall", "Risk"]:
        data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
        ats_resolution = data_settings['ats_resolution']
        if label_name == "Risk":
            target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
            ex_resolution = target_settings['ex_resolution']
        if label_name in ["Complete", "Compliance", "Fall"]:
            ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
            infile = StringIO()
            file_path = pt.PROCESSED_DATA_DIR
            file_name = f'{label_name.lower()}.csv'
            with open(Path.joinpath(file_path, file_name), 'r') as fd:
                shutil.copyfileobj(fd, infile)
            infile.seek(0)
            df = file_reader.read_csv(infile, converters=ats)
        else:
            ex = {str(i)+'Ex': str for i in range(1, ex_resolution+1)}
            ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
            converters = {**ex, **ats}
            infile = StringIO()
            file_path = pt.PROCESSED_DATA_DIR
            file_name = f'{label_name.lower()}.csv'
            with open(Path.joinpath(file_path, file_name), 'r') as fd:
                shutil.copyfileobj(fd, infile)
            infile.seek(0)
            df = file_reader.read_csv(infile, converters=converters)
        if label_name in ["Complete", "Compliance", "Fall"]:
            ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
            oenc = OrdinalEncoder()
            oenc.fit(df[ats_cols].astype(str))
            df_enc = oenc.transform(df[ats_cols].astype(str))
            df_stack = np.column_stack((df.drop(ats_cols + [label_name], axis=1).values,
                                        df_enc, df[[label_name]].values))
            feature_names = df.columns.tolist()
            df = pd.DataFrame(df_stack, columns=feature_names)
        else:
            ex_cols = [str(i)+'Ex' for i in range(1, ex_resolution+1)]
            ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
            total_cols = ex_cols + ats_cols
            oenc = OrdinalEncoder()
            oenc.fit(df[total_cols].astype(str))
            df_enc = oenc.transform(df[total_cols].astype(str))
            df_stack = np.column_stack((df.drop(total_cols + [label_name], axis=1).values,
                                        df_enc, df[[label_name]].values))
            feature_names = df.columns.tolist()
            df = pd.DataFrame(df_stack, columns=feature_names)
        outfile = StringIO()
        file_path = pt.PROCESSED_DATA_DIR
        file_name = f'{label_name.lower()}_ordinal.csv'
        with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
            file_writer.write_csv(df, outfile)
            outfile.seek(0)
            shutil.copyfileobj(outfile, fd)

if __name__ == "__main__":
    main()
\ No newline at end of file
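For readers unfamiliar with OrdinalEncoder, it maps every distinct category in each column to an integer code, which is what gives the new *_ordinal.csv files their numeric ATS columns. A tiny sketch with toy values:

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame({"1Ats": ["cane", "walker", "cane"],
                   "2Ats": ["bed", "0", "lift"]})

# Each column's categories are sorted and replaced by their index.
oenc = OrdinalEncoder()
codes = oenc.fit_transform(df.astype(str))
print(codes)
# [[0. 1.]
#  [1. 0.]
#  [0. 2.]]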
ml/src/model/train_random_forest_model.py → ml/src/model/train_rf_models.py

@@ -41,8 +41,11 @@ def main():
                                        settings).load_data()
     X, y = dl.prepare_data()
-    model = RandomForestClassifier(n_estimators=200, class_weight="balanced",
+    model = RandomForestClassifier(n_estimators=1, bootstrap=False,
+                                   min_samples_leaf=0.1,
+                                   min_samples_split=0.54,
+                                   max_depth=29,
                                    random_state=0)
     model.fit(X, y)
ml/src/model/train_xgboost_model.py → ml/src/model/train_xgb_models.py
File moved
ml/src/tools/classifiers.py

@@ -11,7 +11,6 @@ from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from tools import preprocessor
from keras.wrappers.scikit_learn import KerasClassifier
import numpy as np
import tensorflow as tf
ml/src/tuning/tune_alarm_boost_wb.py (new file, mode 0 → 100644)

from utility.settings import load_settings
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.metrics import (concordance_index_censored,
                            concordance_index_ipcw,
                            integrated_brier_score)
from sklearn.model_selection import KFold
from tools import data_loader, preprocessor
import paths as pt
import numpy as np
import pandas as pd
import os
os.environ["WANDB_SILENT"] = "true"
import wandb

sweep_config = {
    "method": "random",  # try grid or random
    "metric": {
        "name": "c_harrell",
        "goal": "maximize"
    },
    "parameters": {
        "n_estimators": {
            "values": [50, 100, 200, 400, 600, 800, 1000]
        },
        "learning_rate": {
            "values": [0.1, 0.5, 1.0]
        },
        "max_depth": {
            "values": [int(x) for x in np.linspace(1, 18, 15, endpoint=True)]
        },
        "loss": {
            "values": ['coxph']
        },
        "min_samples_split": {
            "values": [int(x) for x in np.linspace(2, 10, 10, endpoint=True)]
        },
        "max_features": {
            "values": [None, "auto", "sqrt", "log2"]
        },
        "dropout_rate": {
            "values": [float(x) for x in np.linspace(0.0, 0.9, 10, endpoint=True)]
        },
        "subsample": {
            "values": [float(x) for x in np.linspace(0.1, 1.0, 10, endpoint=True)]
        }
    }
}

def main():
    sweep_id = wandb.sweep(sweep_config, project="air-alarm-boost")
    wandb.agent(sweep_id, train_model, count=5)

def train_model():
    config_defaults = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 3,
        'loss': 'coxph',
        'min_samples_split': 2,
        'max_features': None,
        'dropout_rate': 0.0,
        'subsample': 1.0,
        'seed': 0,
        'test_size': 0.25,
    }

    # Initialize a new wandb run
    wandb.init(config=config_defaults)

    # Config is a variable that holds and saves hyperparameters and inputs
    config = wandb.config

    # Load data
    data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
    target_settings = load_settings(pt.CONFIGS_DIR, "alarm.yaml")
    dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR,
                                     "alarm_data.pkl",
                                     target_settings).load_data()
    X, y = dl.get_data()

    # Encode X
    ats_resolution = data_settings['ats_resolution']
    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
    X_enc = preprocessor.one_hot_encode(X, ats_cols)
    X = pd.concat([X.drop(ats_cols, axis=1), X_enc], axis=1)

    # Make model
    model = GradientBoostingSurvivalAnalysis(n_estimators=config.n_estimators,
                                             learning_rate=config.learning_rate,
                                             max_depth=config.max_depth,
                                             loss=config.loss,
                                             min_samples_split=config.min_samples_split,
                                             max_features=config.max_features,
                                             dropout_rate=config.dropout_rate,
                                             random_state=0)

    # Make CV
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    c_index_harells = list()
    c_index_unos = list()
    brier_scores = list()
    for train, test in kf.split(X, y):
        model.fit(X.iloc[train], y[train])
        prediction = model.predict(X.iloc[test])
        c_harrell = concordance_index_censored(y[test]["Status"],
                                               y[test]["Days_to_alarm"],
                                               prediction)
        c_uno = concordance_index_ipcw(y[train], y[test], prediction)
        lower, upper = np.percentile(y["Days_to_alarm"], [10, 90])
        alarm_times = np.arange(lower, upper+1)
        surv_prob = np.row_stack([fn(alarm_times)
                                  for fn in model.predict_survival_function(X.iloc[test])])
        brier_score = integrated_brier_score(y[train], y[test], surv_prob, alarm_times)
        c_index_harells.append(c_harrell[0])
        c_index_unos.append(c_uno[0])
        brier_scores.append(brier_score)

    c_index_harell_mean = np.mean(c_index_harells)
    c_index_uno_mean = np.mean(c_index_unos)
    brier_score_mean = np.mean(brier_scores)

    # Log to wandb
    wandb.log({"c_harrell": c_index_harell_mean})
    wandb.log({"c_uno": c_index_uno_mean})
    wandb.log({"brier_score": brier_score_mean})

if __name__ == "__main__":
    main()
\ No newline at end of file
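The survival metrics used above (concordance_index_censored, concordance_index_ipcw, integrated_brier_score) expect y to be a structured array with an event field and a time field. Judging from the field names referenced in the script, the AlarmDataLoader presumably produces something like the following; the values here are made up for illustration:

import numpy as np

# Structured survival target: event indicator plus time-to-event,
# using the field names the tuning scripts index into.
y = np.array([(True, 120.0), (False, 300.0), (True, 45.0)],
             dtype=[("Status", bool), ("Days_to_alarm", float)])

print(y["Status"])         # [ True False  True]
print(y["Days_to_alarm"])  # [120. 300.  45.]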
ml/src/tuning/tune_alarm_rsf_wb.py (new file, mode 0 → 100644)

from utility.settings import load_settings
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import (concordance_index_censored,
                            concordance_index_ipcw,
                            integrated_brier_score)
from sklearn.model_selection import KFold
from tools import data_loader, preprocessor
import paths as pt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os  # required for the WANDB_SILENT setting below
os.environ["WANDB_SILENT"] = "true"
import wandb

sweep_config = {
    "method": "random",  # try grid or random
    "metric": {
        "name": "c_harrell",
        "goal": "maximize"
    },
    "parameters": {
        "n_estimators": {
            "values": [50, 100, 200, 400, 600, 800, 1000]
        },
        "max_depth": {
            "values": [int(x) for x in np.linspace(1, 32, 32, endpoint=True)]
        },
        "min_samples_split": {
            "values": [float(x) for x in np.linspace(0.1, 0.9, 10, endpoint=True)]
        },
        "min_samples_leaf": {
            "values": [float(x) for x in np.linspace(0.1, 0.5, 5, endpoint=True)]
        },
        "max_features": {
            "values": [None, 'auto', 'sqrt', 'log2']
        },
    }
}

def main():
    sweep_id = wandb.sweep(sweep_config, project="air-alarm-rsf")
    wandb.agent(sweep_id, train_model, count=5)

def train_model():
    config_defaults = {
        'n_estimators': [100],
        'max_depth': [None],
        'min_samples_split': [2],
        'min_samples_leaf': [1],
        'max_features': [None],
        "seed": 0,
        "test_size": 0.25,
    }

    # Initialize a new wandb run
    wandb.init(config=config_defaults)

    # Config is a variable that holds and saves hyperparameters and inputs
    config = wandb.config

    # Load data
    data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
    target_settings = load_settings(pt.CONFIGS_DIR, "alarm.yaml")
    dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR,
                                     "alarm_data.pkl",
                                     target_settings).load_data()
    X, y = dl.get_data()

    # Encode X
    ats_resolution = data_settings['ats_resolution']
    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
    X_enc = preprocessor.one_hot_encode(X, ats_cols)
    X = pd.concat([X.drop(ats_cols, axis=1), X_enc], axis=1)

    # Make model
    model = RandomSurvivalForest(n_estimators=config.n_estimators,
                                 max_depth=config.max_depth,
                                 min_samples_split=config.min_samples_split,
                                 min_samples_leaf=config.min_samples_leaf,
                                 max_features=config.max_features,
                                 random_state=0)

    # Make CV
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    c_index_harells = list()
    c_index_unos = list()
    brier_scores = list()
    for train, test in kf.split(X, y):
        model.fit(X.iloc[train], y[train])
        prediction = model.predict(X.iloc[test])
        c_harrell = concordance_index_censored(y[test]["Status"],
                                               y[test]["Days_to_alarm"],
                                               prediction)
        c_uno = concordance_index_ipcw(y[train], y[test], prediction)
        lower, upper = np.percentile(y["Days_to_alarm"], [10, 90])
        alarm_times = np.arange(lower, upper+1)
        surv_prob = np.row_stack([fn(alarm_times)
                                  for fn in model.predict_survival_function(X.iloc[test])])
        brier_score = integrated_brier_score(y[train], y[test], surv_prob, alarm_times)
        c_index_harells.append(c_harrell[0])
        c_index_unos.append(c_uno[0])
        brier_scores.append(brier_score)

    c_index_harell_mean = np.mean(c_index_harells)
    c_index_uno_mean = np.mean(c_index_unos)
    brier_score_mean = np.mean(brier_scores)

    # Log to wandb
    wandb.log({"c_harrell": c_index_harell_mean})
    wandb.log({"c_uno": c_index_uno_mean})
    wandb.log({"brier_score": brier_score_mean})

if __name__ == "__main__":
    main()
\ No newline at end of file
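All three tuning scripts share the same wandb pattern: wandb.init seeds wandb.config with config_defaults, and when train_model runs under wandb.agent the sweep's sampled values replace those defaults. A minimal offline sketch of that mechanism, using wandb's disabled mode so it runs without a sweep or an account:

import wandb

config_defaults = {"n_estimators": 100, "max_depth": 3}
wandb.init(mode="disabled", config=config_defaults)
config = wandb.config

# Outside a sweep the defaults come straight back; under wandb.agent the
# sampled sweep values take their place before train_model is called.
print(config.n_estimators, config.max_depth)  # 100 3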
ml/src/tuning/tune_complete_rf_wb.py (new file, mode 0 → 100644)

from utility.settings import load_settings
from sklearn.ensemble import RandomForestClassifier
from tools import data_loader
import paths as pt
from sklearn.model_selection import cross_validate, StratifiedKFold
import numpy as np
import os
os.environ["WANDB_SILENT"] = "true"
import wandb

sweep_config = {
    "method": "random",  # try grid or random
    "metric": {
        "name": "accuracy",
        "goal": "maximize"
    },
    "parameters": {
        "n_estimators": {
            "values": [1, 2, 4, 8, 16, 32, 64, 100, 200, 400]
        },
        "criterion": {
            "values": ["gini", "entropy"]
        },
        "max_depth": {
            "values": [int(x) for x in np.linspace(1, 32, 32, endpoint=True)]
        },