Commit 94e72200 authored by thecml's avatar thecml

added scripts for hyperparams

parent e1eb738f
Pipeline #99774 failed in 58 seconds
.vscode/settings.json
wandb
\ No newline at end of file
......@@ -2,4 +2,21 @@
#
features_to_normalize: ['BirthYear', 'LoanPeriod', 'NumberAts']
features_to_scale: ['Gender', 'BirthYear', 'LoanPeriod', 'NumberAts']
\ No newline at end of file
features_to_scale: ['Gender', 'BirthYear', 'LoanPeriod', 'NumberAts']
# Dataset Stuff -------------------------------------------------
#
target_name: "Alarm"
model_path: models/alarm/embeddings
use_real_ats_names: False
# Embedding Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs: 5
verbose: True
network_layers: [128]
metrics: ['accuracy']
optimizer: "Adam"
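These embedding hyperparameters are read back by the embedding training script later in this commit (via load_settings) and forwarded to NeuralEmbedder. A minimal sketch of that wiring, assuming this hunk belongs to configs/alarm.yaml and that num_epochs maps onto the epochs argument (as the encode_dataframe signature further down suggests); the toy DataFrame is illustrative only:

import pandas as pd
import paths as pt
from utility.settings import load_settings
from tools import neural_embedder

# Read the Alarm target settings shown above.
settings = load_settings(pt.CONFIGS_DIR, "alarm.yaml")

# Toy stand-in for the preprocessed Ats columns plus the target column.
df = pd.DataFrame({"1Ats": ["120606", "043303", "120606"],
                   "Alarm": [1, 0, 1]})

network = neural_embedder.NeuralEmbedder(df=df,
                                         target_name=settings["target_name"],        # "Alarm"
                                         metrics=settings["metrics"],                # ['accuracy']
                                         epochs=settings["num_epochs"],              # 5
                                         batch_size=settings["batch_size"],          # 32
                                         network_layers=settings["network_layers"])  # [128]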
No preview for this file type
No preview for this file type
......@@ -19,4 +19,5 @@ fastapi-jwt-auth==0.5.0
uvicorn==0.13.4
PyYAML==5.4.1
imbalanced-learn==0.8.0
scikit-survival==0.16.0
\ No newline at end of file
scikit-survival==0.16.0
wandb==0.12.7
\ No newline at end of file
......@@ -69,6 +69,25 @@ def load_data_count(case, settings):
X, y = dl.get_data()
return X, y
def load_data_ordinal(case, settings):
if case == "Complete":
dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
"complete_ordinal.csv", settings).load_data()
X, y = dl.get_data()
elif case == "Compliance":
dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
"compliance_ordinal.csv", settings).load_data()
X, y = dl.get_data()
elif case == "Fall":
dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
"fall_ordinal.csv", settings).load_data()
X, y = dl.get_data()
else:
dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
"risk_ordinal.csv", settings).load_data()
X, y = dl.get_data()
return X, y
def load_data_ohe(case, settings):
if case == "Complete":
dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
......@@ -104,12 +123,14 @@ def main():
encoding='UTF8', newline='') as f:
writer = csv.writer(f)
writer.writerow(header)
versions = ['Embedded', 'Counts', 'OneHot']
versions = ['Embedded', 'Counts', 'OneHot', 'Ordinal']
for version in versions:
if version == "Embedded":
X, y = load_data_embedded(case, target_settings)
elif version == "Counts":
X, y = load_data_count(case, target_settings)
elif version == "Ordinal":
X, y = load_data_ordinal(case, target_settings)
else:
X, y = load_data_ohe(case, target_settings)
......
#!/usr/bin/env python
import paths as pt
from tools import data_loader
from sklearn.preprocessing import LabelEncoder
from tools import data_loader, file_reader
from utility.settings import load_settings
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored
from io import StringIO
from pathlib import Path
import shutil
def main():
data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
......@@ -17,16 +19,15 @@ def main():
target_settings).load_data()
X, y = dl.get_data()
labels_enc = dict()
ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
for col_name in ats_cols:
le = LabelEncoder()
le.fit(X.loc[:, col_name].astype(str))
labels_enc[col_name] = le
X.loc[:, col_name] = le.transform(X.loc[:, col_name].astype(str))
X = X[:10000]
y = y[:10000]
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
infile = StringIO()
file_path = pt.PROCESSED_DATA_DIR
file_name = "alarm_emb.csv"
with open(Path.joinpath(file_path, file_name), 'r') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
emb_file = file_reader.read_csv(infile, converters=ats)
X = emb_file
model = RandomSurvivalForest(n_estimators=200,
max_depth=3,
......
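The rest of this hunk is collapsed, but the imports added above (KFold, concordance_index_censored) and the tuning scripts later in this commit suggest the baseline is scored with cross-validated Harrell's C. A hedged sketch of what the collapsed part presumably does, reusing X, y and model from the snippet above and assuming y is the structured array with "Status" and "Days_to_alarm" fields used elsewhere in this commit:

import numpy as np  # not among the imports shown above

# 5-fold CV, scoring Harrell's concordance index on each held-out fold.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
c_indexes = []
for train, test in kf.split(X, y):
    model.fit(X.iloc[train], y[train])
    prediction = model.predict(X.iloc[test])
    c_harrell = concordance_index_censored(y[test]["Status"],
                                           y[test]["Days_to_alarm"],
                                           prediction)
    c_indexes.append(c_harrell[0])
print(f"Mean Harrell's C: {np.mean(c_indexes):.3f}")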
#!/usr/bin/env python
from tools import file_reader, file_writer
from typing import BinaryIO
from tools import file_reader, file_writer, data_loader
from tools import preprocessor, neural_embedder
from utility.settings import load_settings
import pandas as pd
......@@ -15,13 +16,12 @@ USE_GROUPING = False
ENABLE_EMB_VIZ = False
def main(ats_resolution: int = None):
for label_name in ["Complete", "Compliance", "Fall", "Risk"]:
for label_name in ["Complete", "Compliance", "Fall", "Risk", "Alarm"]:
data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
if ats_resolution == None:
ats_resolution = data_settings['ats_resolution']
if label_name == "Risk":
ex_resolution = target_settings['ex_resolution']
......@@ -34,6 +34,20 @@ def main(ats_resolution: int = None):
shutil.copyfileobj(fd, infile)
infile.seek(0)
df = file_reader.read_csv(infile, converters=ats)
elif label_name == "Alarm":
ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
num_cols = ["BirthYear", "Gender", "LoanPeriod", "NumberAts"]
dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR,
"alarm_data.pkl",
target_settings).load_data()
X, y = dl.get_data()
data = np.column_stack((X[num_cols], X[ats_cols], y["Status"]))
df = pd.DataFrame(data, columns=num_cols + ats_cols + ["Alarm"])
for col in num_cols:
df[col] = df[col].astype(int)
for col in ats_cols:
df[col] = df[col].astype(str)
df["Alarm"] = df["Alarm"].astype(int)
else:
ex = {str(i)+'Ex':str for i in range(1, ex_resolution+1)}
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
......@@ -46,7 +60,7 @@ def main(ats_resolution: int = None):
infile.seek(0)
df = file_reader.read_csv(infile, converters=converters)
if label_name in ["Complete", "Compliance", "Fall"]:
if label_name in ["Complete", "Compliance", "Fall", "Alarm"]:
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
df_to_enc = df.iloc[:,n_numerical_cols:]
......@@ -60,7 +74,7 @@ def main(ats_resolution: int = None):
df = df.drop(ats_cols + ex_cols, axis=1)
model_path = Path.joinpath(pt.ROOT_DIR, target_settings['model_path'])
if label_name in ["Complete", "Compliance", "Fall"]:
if label_name in ["Complete", "Compliance", "Fall", "Alarm"]:
df_enc = encode_dataframe(df=df_to_enc,
target_name=target_settings['target_name'],
metrics=target_settings['metrics'],
......@@ -96,6 +110,8 @@ def main(ats_resolution: int = None):
if label_name in ["Complete", "Compliance", "Fall"]:
df = pd.concat([df.drop(label_name, axis=1), df_enc,
df.pop(label_name)], axis=1)
elif label_name == "Alarm":
df = pd.concat([df.drop(label_name, axis=1), df_enc], axis=1)
else:
df = pd.concat([df.drop(label_name, axis=1), ats_enc,
ex_enc, df.pop(label_name)], axis=1)
......@@ -113,7 +129,8 @@ def encode_dataframe(df, target_name, metrics, batch_size, train_ratio, epochs,
X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_emb(df,
target_name,
train_ratio)
network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name,
network = neural_embedder.NeuralEmbedder(df=df,
target_name=target_name,
metrics=metrics,
epochs=epochs, batch_size=batch_size,
network_layers=network_layers,
......
#!/usr/bin/env python
import paths as pt
from tools import file_reader, file_writer
from tools import preprocessor
from utility import embedder
import pandas as pd
import numpy as np
from pathlib import Path
from utility.settings import load_settings
from io import StringIO
import shutil
from sklearn.preprocessing import OrdinalEncoder
def main():
for label_name in ["Complete", "Compliance", "Fall", "Risk"]:
data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
ats_resolution = data_settings['ats_resolution']
if label_name == "Risk":
target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
ex_resolution = target_settings['ex_resolution']
if label_name in ["Complete", "Compliance", "Fall"]:
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
infile = StringIO()
file_path = pt.PROCESSED_DATA_DIR
file_name = f'{label_name.lower()}.csv'
with open(Path.joinpath(file_path, file_name), 'r') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df = file_reader.read_csv(infile, converters=ats)
else:
ex = {str(i)+'Ex':str for i in range(1, ex_resolution+1)}
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
converters = {**ex, **ats}
infile = StringIO()
file_path = pt.PROCESSED_DATA_DIR
file_name = f'{label_name.lower()}.csv'
with open(Path.joinpath(file_path, file_name), 'r') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df = file_reader.read_csv(infile, converters=converters)
if label_name in ["Complete", "Compliance", "Fall"]:
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
oenc = OrdinalEncoder()
oenc.fit(df[ats_cols].astype(str))
df_enc = oenc.transform(df[ats_cols].astype(str))
df_stack = np.column_stack((df.drop(ats_cols + [label_name], axis=1).values,
df_enc, df[[label_name]].values))
feature_names = df.columns.tolist()
df = pd.DataFrame(df_stack, columns=feature_names)
else:
ex_cols = [str(i)+'Ex' for i in range(1, ex_resolution+1)]
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
total_cols = ex_cols + ats_cols
oenc = OrdinalEncoder()
oenc.fit(df[total_cols].astype(str))
df_enc = oenc.transform(df[total_cols].astype(str))
df_stack = np.column_stack((df.drop(total_cols + [label_name], axis=1).values,
df_enc, df[[label_name]].values))
feature_names = df.columns.tolist()
df = pd.DataFrame(df_stack, columns=feature_names)
outfile = StringIO()
file_path = pt.PROCESSED_DATA_DIR
file_name = f'{label_name.lower()}_ordinal.csv'
with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
file_writer.write_csv(df, outfile)
outfile.seek(0)
shutil.copyfileobj(outfile, fd)
if __name__ == "__main__":
main()
\ No newline at end of file
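For reference, OrdinalEncoder simply maps each distinct category string to an integer, column by column, so the *_ordinal.csv files keep the original shape but with integer-coded Ats/Ex columns. A self-contained toy illustration (the ATS codes here are made up):

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

toy = pd.DataFrame({"1Ats": ["222718", "043303", "222718"],
                    "2Ats": ["120606", "120606", "043303"]})
enc = OrdinalEncoder()
print(enc.fit_transform(toy.astype(str)))
# [[1. 1.]
#  [0. 1.]
#  [1. 0.]]  <- categories are ordered lexicographically within each column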
......@@ -41,8 +41,11 @@ def main():
settings).load_data()
X, y = dl.prepare_data()
model = RandomForestClassifier(n_estimators=200,
class_weight="balanced",
model = RandomForestClassifier(n_estimators=1,
bootstrap=False,
min_samples_leaf=0.1,
min_samples_split=0.54,
max_depth=29,
random_state=0)
model.fit(X, y)
......
......@@ -11,7 +11,6 @@ from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from tools import preprocessor
from keras.wrappers.scikit_learn import KerasClassifier
import numpy as np
import tensorflow as tf
......
from utility.settings import load_settings
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.metrics import (concordance_index_censored,
concordance_index_ipcw,
integrated_brier_score)
from sklearn.model_selection import KFold
from tools import data_loader, preprocessor
import paths as pt
import numpy as np
import pandas as pd
import os
os.environ["WANDB_SILENT"] = "true"
import wandb
sweep_config = {
"method": "random", # try grid or random
"metric": {
"name": "c_harrell",
"goal": "maximize"
},
"parameters": {
"n_estimators": {
"values": [50, 100, 200, 400, 600, 800, 1000]
},
"learning_rate": {
"values": [0.1, 0.5, 1.0]
},
"max_depth": {
"values": [int(x) for x in np.linspace(1, 18, 15, endpoint=True)]
},
"loss": {
"values": ['coxph']
},
"min_samples_split": {
"values": [int(x) for x in np.linspace(2, 10, 10, endpoint=True)]
},
"max_features": {
"values": [None, "auto", "sqrt", "log2"]
},
"dropout_rate": {
"values": [float(x) for x in np.linspace(0.0, 0.9, 10, endpoint=True)]
},
"subsample": {
"values": [float(x) for x in np.linspace(0.1, 1.0, 10, endpoint=True)]
}
}
}
def main():
sweep_id = wandb.sweep(sweep_config,
project="air-alarm-boost")
wandb.agent(sweep_id, train_model, count=5)
def train_model():
config_defaults = {
'n_estimators': 100,
'learning_rate': 0.1,
'max_depth': 3,
'loss': 'coxph',
'min_samples_split': 2,
'max_features': None,
'dropout_rate': 0.0,
'subsample': 1.0,
'seed': 0,
'test_size': 0.25,
}
# Initialize a new wandb run
wandb.init(config=config_defaults)
# Config is a variable that holds and saves hyperparameters and inputs
config = wandb.config
# Load data
data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
target_settings = load_settings(pt.CONFIGS_DIR, "alarm.yaml")
dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR,
"alarm_data.pkl",
target_settings).load_data()
X, y = dl.get_data()
# Encode X
ats_resolution = data_settings['ats_resolution']
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
X_enc = preprocessor.one_hot_encode(X, ats_cols)
X = pd.concat([X.drop(ats_cols, axis=1), X_enc], axis=1)
# Make model
model = GradientBoostingSurvivalAnalysis(n_estimators=config.n_estimators,
learning_rate=config.learning_rate,
max_depth=config.max_depth,
loss=config.loss,
min_samples_split=config.min_samples_split,
max_features=config.max_features,
dropout_rate=config.dropout_rate,
subsample=config.subsample,
random_state=0)
# Make CV
kf = KFold(n_splits=5, shuffle=True, random_state=0)
c_index_harells = list()
c_index_unos = list()
brier_scores = list()
for train, test in kf.split(X, y):
model.fit(X.iloc[train], y[train])
prediction = model.predict(X.iloc[test])
c_harrell = concordance_index_censored(y[test]["Status"],
y[test]["Days_to_alarm"],
prediction)
c_uno = concordance_index_ipcw(y[train], y[test], prediction)
lower, upper = np.percentile(y["Days_to_alarm"], [10, 90])
alarm_times = np.arange(lower, upper+1)
surv_prob = np.row_stack([fn(alarm_times)
for fn in model.predict_survival_function(X.iloc[test])])
brier_score = integrated_brier_score(y[train], y[test],
surv_prob, alarm_times)
c_index_harells.append(c_harrell[0])
c_index_unos.append(c_uno[0])
brier_scores.append(brier_score)
c_index_harell_mean = np.mean(c_index_harells)
c_index_uno_mean = np.mean(c_index_unos)
brier_score_mean = np.mean(brier_scores)
# Log to wandb
wandb.log({"c_harrell": c_index_harell_mean})
wandb.log({"c_uno": c_index_uno_mean})
wandb.log({"brier_score": brier_score_mean})
if __name__ == "__main__":
main()
\ No newline at end of file
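Both tuning scripts follow the same pattern: wandb.sweep registers the search space and wandb.agent calls train_model repeatedly, injecting one sampled combination into wandb.config per run. The committed version does random search with count=5; a small sketch of how the search could be widened (wandb also accepts "grid" and "bayes" as sweep methods, the latter using the declared metric/goal):

# Hypothetical variation, not part of the commit: more runs, Bayesian search.
sweep_config["method"] = "bayes"
sweep_id = wandb.sweep(sweep_config, project="air-alarm-boost")
wandb.agent(sweep_id, train_model, count=50)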
from utility.settings import load_settings
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import (concordance_index_censored,
concordance_index_ipcw,
integrated_brier_score)
from sklearn.model_selection import KFold
from tools import data_loader, preprocessor
import paths as pt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
os.environ["WANDB_SILENT"] = "true"
import wandb
sweep_config = {
"method": "random", # try grid or random
"metric": {
"name": "c_harrell",
"goal": "maximize"
},
"parameters": {
"n_estimators": {
"values": [50, 100, 200, 400, 600, 800, 1000]
},
"max_depth": {
"values": [int(x) for x in np.linspace(1, 32, 32, endpoint=True)]
},
"min_samples_split": {
"values": [float(x) for x in np.linspace(0.1, 0.9, 10, endpoint=True)]
},
"min_samples_leaf": {
"values": [float(x) for x in np.linspace(0.1, 0.5, 5, endpoint=True)]
},
"max_features": {
"values": [None, 'auto', 'sqrt', 'log2']
},
}
}
def main():
sweep_id = wandb.sweep(sweep_config,
project="air-alarm-rsf")
wandb.agent(sweep_id, train_model, count=5)
def train_model():
config_defaults = {
'n_estimators': 100,
'max_depth': None,
'min_samples_split': 2,
'min_samples_leaf': 1,
'max_features': None,
"seed": 0,
"test_size": 0.25,
}
# Initialize a new wandb run
wandb.init(config=config_defaults)
# Config is a variable that holds and saves hyperparameters and inputs
config = wandb.config
# Load data
data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
target_settings = load_settings(pt.CONFIGS_DIR, "alarm.yaml")
dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR,
"alarm_data.pkl",
target_settings).load_data()
X, y = dl.get_data()
# Encode X
ats_resolution = data_settings['ats_resolution']
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
X_enc = preprocessor.one_hot_encode(X, ats_cols)
X = pd.concat([X.drop(ats_cols, axis=1), X_enc], axis=1)
# Make model
model = RandomSurvivalForest(n_estimators=config.n_estimators,
max_depth=config.max_depth,
min_samples_split=config.min_samples_split,
min_samples_leaf=config.min_samples_leaf,
max_features=config.max_features,
random_state=0)
# Make CV
kf = KFold(n_splits=5, shuffle=True, random_state=0)
c_index_harells = list()
c_index_unos = list()
brier_scores = list()
for train, test in kf.split(X, y):
model.fit(X.iloc[train], y[train])
prediction = model.predict(X.iloc[test])
c_harrell = concordance_index_censored(y[test]["Status"],
y[test]["Days_to_alarm"],
prediction)
c_uno = concordance_index_ipcw(y[train], y[test], prediction)
lower, upper = np.percentile(y["Days_to_alarm"], [10, 90])
alarm_times = np.arange(lower, upper+1)
surv_prob = np.row_stack([fn(alarm_times)
for fn in model.predict_survival_function(X.iloc[test])])
brier_score = integrated_brier_score(y[train], y[test],
surv_prob, alarm_times)
c_index_harells.append(c_harrell[0])
c_index_unos.append(c_uno[0])
brier_scores.append(brier_score)
c_index_harell_mean = np.mean(c_index_harells)
c_index_uno_mean = np.mean(c_index_unos)
brier_score_mean = np.mean(brier_scores)
# Log to wandb
wandb.log({"c_harrell": c_index_harell_mean})
wandb.log({"c_uno": c_index_uno_mean})
wandb.log({"brier_score": brier_score_mean})
if __name__ == "__main__":
main()
\ No newline at end of file
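Unlike the boosting sweep, this one samples min_samples_split and min_samples_leaf as floats; in the sklearn/scikit-survival convention a float in (0, 1] is read as a fraction of the training samples rather than an absolute count, which is also how the min_samples_leaf=0.1 and min_samples_split=0.54 values in the classifier change above should be read. A tiny self-contained check (illustrative numbers only):

from math import ceil

# A float min_samples_leaf of 0.1 on 8,000 training rows means
# ceil(0.1 * 8000) = 800 samples per leaf -- a strong regularizer.
n_train = 8000
for frac in (0.1, 0.3, 0.5):
    print(frac, "->", ceil(frac * n_train), "samples per leaf")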
from utility.settings import load_settings
from sklearn.ensemble import RandomForestClassifier
from tools import data_loader
import paths as pt
from sklearn.model_selection import cross_validate, StratifiedKFold
import numpy as np
import os
os.environ["WANDB_SILENT"] = "true"
import wandb
sweep_config = {
"method": "random", # try grid or random
"metric": {
"name": "accuracy",
"goal": "maximize"
},
"parameters": {
"n_estimators": {
"values": [1, 2, 4, 8, 16, 32, 64, 100, 200, 400]
},
"criterion": {
"values": ["gini", "entropy"]
},
"max_depth": {
"values": [<