Commit a445a6df authored by thecml's avatar thecml
Browse files

made a CV (cross-validation) version of the neural embedder

parent d06bd82e
Pipeline #63159 passed with stage
in 3 minutes
......@@ -8,12 +8,20 @@ import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
def main(enable_visualization: bool = True):
make_complete_emb(enable_visualization)
make_compliance_emb(enable_visualization)
make_fall_emb(enable_visualization)
make_fall_test_emb(enable_visualization)
# Toggle between cross-validated embedding training (encode_dataframe_cv)
# and the single train/validation split path (encode_dataframe).
USE_CROSS_VALID = True
# When True, render embedding visualizations after training.
ENABLE_EMB_VIZ = False
# Verbosity flag forwarded to the embedder config (see get_config).
VERBOSE = False
def main():
    """Build entity embeddings for every supported target case."""
    emb_makers = (
        make_complete_emb,
        make_compliance_emb,
        make_fall_emb,
        make_fall_test_emb,
    )
    for make_emb in emb_makers:
        make_emb()
def get_config(df_to_enc, target_name, artifacts_path):
return {
......@@ -21,13 +29,13 @@ def get_config(df_to_enc, target_name, artifacts_path):
"target_name": target_name,
"train_ratio": 0.8,
"network_layers": ([128]),
"epochs": 5,
"epochs": 200,
"batch_size": 32,
"verbose": False,
"verbose": VERBOSE,
"artifacts_path": artifacts_path
}
def make_complete_emb(enable_visualization):
def make_complete_emb():
target_name = 'Complete'
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
......@@ -39,10 +47,22 @@ def make_complete_emb(enable_visualization):
df_to_enc = df.iloc[:,n_numerical_cols:]
artifacts_path = cfg.COMPLIANCE_EMB_DIR
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path, enable_visualization)
file_writer.write_csv(df_enc, cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
def make_compliance_emb(enable_visualization):
if USE_CROSS_VALID:
df_enc = encode_dataframe_cv(df_to_enc, target_name, artifacts_path)
else:
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path)
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
df = df.drop(ats_cols, axis=1)
df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
if USE_CROSS_VALID:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb_cv.csv')
else:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
def make_compliance_emb():
target_name = 'Compliance'
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
......@@ -54,10 +74,22 @@ def make_compliance_emb(enable_visualization):
df_to_enc = df.iloc[:,n_numerical_cols:]
artifacts_path = cfg.COMPLIANCE_EMB_DIR
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path, enable_visualization)
file_writer.write_csv(df_enc, cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
def make_fall_emb(enable_visualization):
if USE_CROSS_VALID:
df_enc = encode_dataframe_cv(df_to_enc, target_name, artifacts_path)
else:
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path)
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
df = df.drop(ats_cols, axis=1)
df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
if USE_CROSS_VALID:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb_cv.csv')
else:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
def make_fall_emb():
target_name = 'Fall'
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
......@@ -69,10 +101,22 @@ def make_fall_emb(enable_visualization):
df_to_enc = df.iloc[:,n_numerical_cols:]
artifacts_path = cfg.FALL_EMB_DIR
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path, enable_visualization)
file_writer.write_csv(df_enc, cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
def make_fall_test_emb(enable_visualization):
if USE_CROSS_VALID:
df_enc = encode_dataframe_cv(df_to_enc, target_name, artifacts_path)
else:
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path)
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
df = df.drop(ats_cols, axis=1)
df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
if USE_CROSS_VALID:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_emb_cv.csv')
else:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
def make_fall_test_emb():
ex = {str(i)+'Ex':str for i in range(1, cfg.EX_RESOLUTION+1)}
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
converters = {**ex, **ats}
......@@ -90,15 +134,75 @@ def make_fall_test_emb(enable_visualization):
df_ex_to_enc = df_ex_to_enc.drop(['NumberFalls'], axis=1)
artifacts_path = cfg.FALL_TEST_EMB_DIR
ats_enc = encode_dataframe(df_ats_to_enc, 'Fall', artifacts_path, enable_visualization)
ex_enc = encode_dataframe(df_ex_to_enc, 'Fall', artifacts_path, enable_visualization)
if USE_CROSS_VALID:
ats_enc = encode_dataframe_cv(df_ats_to_enc, 'Fall', artifacts_path)
ex_enc = encode_dataframe_cv(df_ex_to_enc, 'Fall', artifacts_path)
else:
ats_enc = encode_dataframe(df_ats_to_enc, 'Fall', artifacts_path)
ex_enc = encode_dataframe(df_ex_to_enc, 'Fall', artifacts_path)
df = df.drop(ats_cols + ex_cols, axis=1)
df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, df.pop('Fall')], axis=1)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
if USE_CROSS_VALID:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb_cv.csv')
else:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
    """Learn entity embeddings for the categorical columns of ``df_to_enc``
    with 5-fold stratified cross-validation, average the embedding weights
    across folds, and replace each categorical column with a 1-D PCA
    projection of its averaged embedding.

    :param df_to_enc: frame holding the categorical columns plus the target
    :param target_name: name of the target column inside ``df_to_enc``
    :param artifacts_path: directory where network weights/labels are saved
    :return: ``df_to_enc`` with the target column dropped and categorical
             values replaced by scalar embedding projections
    """
    params = get_config(df_to_enc, target_name, artifacts_path)
    X, y = preprocessor.get_X_y(df_to_enc, target_name)
    X, labels = preprocessor.encode_vector_label(X)
    y = np.array(y)

    network = neural_embedder.NeuralEmbedder(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    # Stop a fold's training once validation loss stops improving.
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                mode='min',
                                                patience=3,
                                                verbose=1)
    weights = []
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X[train_index,:], X[valid_index,:]
        y_train, y_valid = y[train_index], y[valid_index]
        _ = network.fit(X_train, y_train, X_valid, y_valid,
                        callbacks=[callback])
        embedded_weights = network.get_embedded_weights()
        weights.append(embedded_weights)

    # Element-wise mean of the embedding weights over the 5 folds:
    # zip(*weights) groups the per-fold weight lists position by position.
    # NOTE(review): the same `network` instance is refit on every fold, so
    # later folds start from the previous fold's weights — confirm that is
    # intended rather than re-initializing per fold.
    new_weights = list()
    for weights_list_tuple in zip(*weights):
        new_weights.append(
            [np.array(weights_).mean(axis=0)\
                for weights_ in zip(*weights_list_tuple)])

    network.save_weights(new_weights)
    network.save_labels(labels)
    if ENABLE_EMB_VIZ:
        network.make_visualizations_from_network(extension='png')

    df_to_enc = df_to_enc.drop(target_name, axis=1)
    # NOTE(review): range(shape[1] - 1) skips the LAST remaining column
    # after the target drop — looks like an off-by-one; confirm against
    # encode_dataframe and the expected number of encoded columns.
    for index in range(df_to_enc.shape[1] - 1):
        column = df_to_enc.columns[index]
        labels_column = labels[index]
        embeddings_column = new_weights[index]

        # Collapse the column's embedding matrix to one scalar per class.
        pca = PCA(n_components=1)
        Y = pca.fit_transform(embeddings_column)
        y_array = np.concatenate(Y)

        # Persist the class -> scalar mapping, then apply it to the column.
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                        Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
                        f'{target_name.lower()}_{column}.csv')
        df_to_enc[column] = df_to_enc[column].replace(to_replace=mapping)
    return df_to_enc
def encode_dataframe(df_to_enc, target_name, artifacts_path, enable_visualization):
def encode_dataframe(df_to_enc, target_name, artifacts_path):
params = get_config(df_to_enc, target_name, artifacts_path)
X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(df_to_enc,
......@@ -112,7 +216,7 @@ def encode_dataframe(df_to_enc, target_name, artifacts_path, enable_visualizatio
network.save_weights(embedded_weights)
network.save_labels(labels)
if enable_visualization:
if ENABLE_EMB_VIZ:
network.make_visualizations_from_network(extension='png')
df_to_enc = df_to_enc.drop(target_name, axis=1)
......
......@@ -19,28 +19,28 @@ def main(dataset_version : str = 'emb'):
if dataset_version == 'ohe':
df = file_reader.read_csv(DATA_DIR, 'complete_count.csv')
else:
df = file_reader.read_csv(DATA_DIR, 'complete_emb.csv')
df = file_reader.read_csv(DATA_DIR, 'complete_emb_cv.csv')
model_dir = cfg.COMPLETE_XGB_DIR
target_name = "Complete"
elif case == "Compliance":
if dataset_version == 'ohe':
df = file_reader.read_csv(DATA_DIR, 'compliance_count.csv')
else:
df = file_reader.read_csv(DATA_DIR, 'compliance_emb.csv')
df = file_reader.read_csv(DATA_DIR, 'compliance_emb_cv.csv')
model_dir = cfg.COMPLIANCE_XGB_DIR
target_name = "Compliance"
elif case == "Fall":
if dataset_version == 'ohe':
df = file_reader.read_csv(DATA_DIR, 'fall_count.csv')
else:
df = file_reader.read_csv(DATA_DIR, 'fall_emb.csv')
df = file_reader.read_csv(DATA_DIR, 'fall_emb_cv.csv')
model_dir = cfg.FALL_XGB_DIR
target_name = "Fall"
else:
if dataset_version == 'ohe':
df = file_reader.read_csv(DATA_DIR, 'fall_test_count.csv')
else:
df = file_reader.read_csv(DATA_DIR, 'fall_test_emb.csv')
df = file_reader.read_csv(DATA_DIR, 'fall_test_emb_cv.csv')
model_dir = cfg.FALL_TEST_XGB_DIR
target_name = "Fall"
......@@ -58,11 +58,11 @@ def main(dataset_version : str = 'emb'):
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.1,
"learning_rate": 0.07,
"eval_metric": "logloss",
"seed": 0
}
model = xgb.XGBClassifier(**params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
......@@ -77,10 +77,10 @@ def main(dataset_version : str = 'emb'):
if optimize_rounds:
eval_set=[(X_valid_split, y_valid_split)]
fit_model = model.fit(X_train_split, y_train_split,
eval_set=eval_set,
eval_metric=metrics.gini_xgb,
early_stopping_rounds=early_stopping_rounds,
verbose=False)
eval_set=eval_set,
eval_metric=metrics.gini_xgb,
early_stopping_rounds=early_stopping_rounds,
verbose=False)
else:
fit_model = model.fit(X_train_split, y_train_split)
......
......@@ -69,7 +69,7 @@ class NeuralEmbedder:
self.regularization_factor = regularization_factor
self.loss_fn = loss_fn
self.optimizer_fn = optimizer_fn
self.metrics = metrics
self.metrics = metrics
self.verbose = verbose
self.artifacts_path = artifacts_path
self.unique_classes = self.df[self.target_name].nunique()
......@@ -213,7 +213,7 @@ class NeuralEmbedder:
:param y_train: training targets
:param X_valid: validation features
:param y_valid: validation targets
:return a History object
:return a History object and model
"""
history = self.model.fit(x=transpose_to_list(X_train),
y=y_train,
......@@ -225,6 +225,12 @@ class NeuralEmbedder:
verbose=self.verbose)
return history
    def get_model(self):
        """Return the wrapped model instance."""
        return self.model
    def set_weights(self, weights):
        """Overwrite the model's weights in place.

        :param weights: list of weight arrays matching the model's layers
        """
        self.model.set_weights(weights)
    def save_model(self) -> None:
        """Persist the full model to the configured artifacts path."""
        self.model.save(self.artifacts_path)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment