Commit fcfe4493 authored by thecml

added an MLP script to make CV embeddings

parent 7c4dda75
Pipeline #63428 passed with stage in 2 minutes and 59 seconds
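The "CV embeddings" referred to in the message are, judging from encode_dataframe_cv further down in this diff, entity embeddings refit on each StratifiedKFold split, with the learned weight matrices collected per fold. A minimal sketch of that pattern, assuming the per-fold matrices are ultimately averaged (the combining step is not visible in the hunks below) and using illustrative names (cv_embedding_weights, fit_fold) that do not exist in the repository:

import numpy as np
from sklearn.model_selection import StratifiedKFold

def cv_embedding_weights(fit_fold, X, y, n_splits=5, seed=0):
    # fit_fold(X_tr, y_tr, X_va, y_va) should return one embedding matrix
    # per categorical feature, as the neural embedder in this commit does.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_weights = [fit_fold(X[tr], y[tr], X[va], y[va])
                    for tr, va in skf.split(X, y)]
    # Assumed reduction step: average each feature's matrix across folds.
    return [np.mean(mats, axis=0) for mats in zip(*fold_weights)]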
@@ -11,9 +11,10 @@ from utility.metrics import gini_xgb
import shap
from typing import List
CASE = "Fall"
COMPLETE_FILENAME = "complete_count.csv"
FALL_FILENAME = "fall_count.csv"
CASE = "Complete"
COMPLETE_FILENAME = "complete_emb.csv"
COMPLIANCE_FILENAME = "compliance_emb.csv"
FALL_FILENAME = "fall_emb.csv"
CSV_FILENAME = f"{CASE} best features.csv"
PLOT_FILENAME = f"{CASE} SHAP feature values"
NUM_ITERATIONS = 5
@@ -25,10 +26,11 @@ def main():
dl = data_loader.FallDataLoader(FALL_FILENAME).load_data()
X, y = dl.get_data()
#X['Random'] = np.random.rand(len(X),1) # add random noise col
cols = X.columns
X = np.array(X)
y = np.array(y)
total_shap_df = pd.DataFrame()
for seed in range(NUM_ITERATIONS):
print('#'*40, '{} of {} iterations'.format(seed+1, NUM_ITERATIONS), '#' * 40)
@@ -41,7 +43,7 @@ def main():
importances = shap_sorted_df['shap_values']
features = shap_sorted_df['feature']
file_writer.write_shap_importance_plot(features, importances, cfg.REPORTS_DIR, PLOT_FILENAME)
file_writer.write_shap_importance_plot(features, importances, cfg.REPORTS_PLOTS_DIR, PLOT_FILENAME)
file_writer.write_csv(shap_sorted_df, cfg.REPORTS_DIR, CSV_FILENAME)
def get_best_shap_features(X: np.ndarray, y: np.ndarray,
@@ -53,8 +55,8 @@ def get_best_shap_features(X: np.ndarray, y: np.ndarray,
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos
model = xgb.XGBClassifier(n_estimators=400,
learning_rate=0.1,
model = xgb.XGBClassifier(n_estimators=200,
learning_rate=0.07,
objective='binary:logistic',
scale_pos_weight=scale_pos_weight,
eval_metric='logloss',
@@ -71,7 +73,6 @@ def get_best_shap_features(X: np.ndarray, y: np.ndarray,
model.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_val, y_val)],
eval_metric=gini_xgb,
early_stopping_rounds=10,
verbose=0)
y_val_pred = model.predict_proba(X_val)[:,1]
y_scores_new = (y_val_pred > 0.5)
......
import pandas as pd
import numpy as np
import config as cfg
import os
import csv
import joblib
from pathlib import Path
from data import make_dataset_full, make_dataset_emb
from tools import file_reader, file_writer, explainer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
EPOCHS = 200
def main():
target_name = "Complete"
step_size = 10
for idx in range(1, cfg.ATS_RESOLUTION+1, step_size):
logloss_train, logloss_test = list(), list()
auc_train, auc_test = list(), list()
for ats_res in range(idx, idx+step_size):
make_dataset_full.main(ats_resolution=ats_res)
make_dataset_emb.main(ats_resolution=ats_res)
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
X = df.drop([target_name], axis=1)
y = df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
stratify=y,
random_state=0)
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos
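# scale_pos_weight = (#negatives / #positives) is the standard XGBoost correction for class imbalance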
params = {"n_estimators": EPOCHS,
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.07,
"seed": 0
}
model = xgb.XGBClassifier(**params)
eval_set = [(X_train, y_train), (X_test, y_test)]
eval_metric = ["auc", "logloss"]
model.fit(X_train, y_train,
eval_set=eval_set,
eval_metric=eval_metric,
verbose=False)
results = model.evals_result()
logloss_train.append((ats_res, results['validation_0']['logloss']))
logloss_test.append((ats_res, results['validation_1']['logloss']))
auc_train.append((ats_res, results['validation_0']['auc']))
auc_test.append((ats_res, results['validation_1']['auc']))
fig, ax = plt.subplots()
x_axis = range(0, EPOCHS)
cmap = plt.cm.coolwarm
for entry in logloss_train:
ax.plot(x_axis, entry[1], label=f'Train ATS={entry[0]}', color=cmap(0.))
ax.text(x_axis[-1], entry[1][-1], f'ATS={entry[0]}', fontsize=2)
for entry in logloss_test:
ax.plot(x_axis, entry[1], label=f'Test ATS={entry[0]}', color=cmap(1.))
ax.text(x_axis[-1], entry[1][-1], f'ATS={entry[0]}', fontsize=2)
custom_lines = [Line2D([0], [0], color=cmap(0.), lw=4),
Line2D([0], [0], color=cmap(1.), lw=4)]
ax.legend(custom_lines, ['Train logloss',
'Test logloss'])
file_name = f"XGBoost logloss for ATS {idx}-{idx+(step_size-1)}"
plt.ylabel('Logloss')
plt.xlabel('Iterations')
plt.title(file_name)
plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, f'{file_name}.pdf'),
dpi=300,
bbox_inches = "tight")
if __name__ == "__main__":
main()
\ No newline at end of file
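For reference, the outer loop above steps through the ATS resolution in windows of step_size, and the inner loop trains one model per resolution inside the window; this only lines up cleanly when cfg.ATS_RESOLUTION is a multiple of step_size. A small sketch with an assumed resolution of 50 (illustrative value only):

ATS_RESOLUTION = 50  # assumed for illustration; the real value comes from config
step_size = 10
windows = [(idx, idx + step_size - 1)
           for idx in range(1, ATS_RESOLUTION + 1, step_size)]
print(windows)  # [(1, 10), (11, 20), (21, 30), (31, 40), (41, 50)]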
@@ -14,15 +14,18 @@ from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
USE_CROSS_VALID = True
USE_GROUPING = True
USE_GROUPING = False
ENABLE_EMB_VIZ = False
VERBOSE = False
VERBOSE = True
def main():
make_complete_emb()
make_compliance_emb()
make_fall_emb()
make_fall_test_emb()
def main(ats_resolution: int = None):
if ats_resolution is None:
ats_resolution = cfg.ATS_RESOLUTION
make_complete_emb(ats_resolution)
make_compliance_emb(ats_resolution)
make_fall_emb(ats_resolution)
make_fall_test_emb(ats_resolution)
def get_config(df_to_enc, target_name, artifacts_path):
return {
@@ -30,21 +33,21 @@ def get_config(df_to_enc, target_name, artifacts_path):
"target_name": target_name,
"train_ratio": 0.8,
"network_layers": ([128]),
"epochs": 200,
"epochs": 5,
"batch_size": 32,
"verbose": VERBOSE,
"artifacts_path": artifacts_path
}
def make_complete_emb():
def make_complete_emb(ats_resolution):
target_name = 'Complete'
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
'complete.csv',
converters=ats)
if USE_GROUPING: # encode groups of ats
for col_idx in range(1, cfg.ATS_RESOLUTION-1, 5):
for col_idx in range(1, ats_resolution-1, 5):
df[f'{col_idx}_{col_idx+4}Ats'] = df[f'{col_idx}Ats'].apply(str) \
+ '_' + df[f'{col_idx+1}Ats'].apply(str) \
+ '_' + df[f'{col_idx+2}Ats'].apply(str) \
@@ -56,37 +59,33 @@ def make_complete_emb():
else:
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
df_to_enc = df.iloc[:,n_numerical_cols:]
artifacts_path = cfg.COMPLIANCE_EMB_DIR
#n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
#df_to_enc = df.iloc[:,n_numerical_cols:]
artifacts_path = cfg.COMPLETE_EMB_DIR
if USE_CROSS_VALID:
df_enc = encode_dataframe_cv(df_to_enc, target_name, artifacts_path)
df_enc = encode_dataframe_cv(df, target_name, artifacts_path)
else:
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path)
df_enc = encode_dataframe(df, target_name, artifacts_path)
if USE_GROUPING:
df = df.drop(columns=list(df.filter(regex='_')))
else:
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
df = df.drop(ats_cols, axis=1)
df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
if USE_CROSS_VALID:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb_cv.csv')
else:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
def make_compliance_emb():
def make_compliance_emb(ats_resolution):
target_name = 'Compliance'
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
f'compliance.csv',
converters=ats)
if USE_GROUPING: # encode groups of ats
for col_idx in range(1, cfg.ATS_RESOLUTION-1, 5):
for col_idx in range(1, ats_resolution-1, 5):
df[f'{col_idx}_{col_idx+4}Ats'] = df[f'{col_idx}Ats'].apply(str) \
+ '_' + df[f'{col_idx+1}Ats'].apply(str) \
+ '_' + df[f'{col_idx+2}Ats'].apply(str) \
@@ -111,25 +110,21 @@ def make_compliance_emb():
if USE_GROUPING:
df = df.drop(columns=list(df.filter(regex='_')))
else:
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
df = df.drop(ats_cols, axis=1)
df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
if USE_CROSS_VALID:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb_cv.csv')
else:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
def make_fall_emb():
def make_fall_emb(ats_resolution):
target_name = 'Fall'
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
f'fall.csv',
converters=ats)
if USE_GROUPING: # encode groups of ats
for col_idx in range(1, cfg.ATS_RESOLUTION-1, 5):
for col_idx in range(1, ats_resolution-1, 5):
df[f'{col_idx}_{col_idx+4}Ats'] = df[f'{col_idx}Ats'].apply(str) \
+ '_' + df[f'{col_idx+1}Ats'].apply(str) \
+ '_' + df[f'{col_idx+2}Ats'].apply(str) \
@@ -154,25 +149,21 @@ def make_fall_emb():
if USE_GROUPING:
df = df.drop(columns=list(df.filter(regex='_')))
else:
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
df = df.drop(ats_cols, axis=1)
df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
if USE_CROSS_VALID:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_emb_cv.csv')
else:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
def make_fall_test_emb():
def make_fall_test_emb(ats_resolution):
ex = {str(i)+'Ex':str for i in range(1, cfg.EX_RESOLUTION+1)}
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
converters = {**ex, **ats}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
f'fall_test.csv',
converters=converters)
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
ex_cols = [str(i)+'Ex' for i in range(1, cfg.EX_RESOLUTION+1)]
df_ats_to_enc = df.filter(regex=f'Fall|((\d+)[Ats])\w+', axis=1)
@@ -192,11 +183,7 @@ def make_fall_test_emb():
df = df.drop(ats_cols + ex_cols, axis=1)
df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, df.pop('Fall')], axis=1)
if USE_CROSS_VALID:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb_cv.csv')
else:
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
params = get_config(df_to_enc, target_name, artifacts_path)
@@ -207,10 +194,10 @@ def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
network = neural_embedder.NeuralEmbedder(**params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
mode='min',
patience=3,
verbose=1)
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
mode='min',
patience=3,
verbose=0)
weights = []
for train_index, valid_index in skf.split(X, y):
@@ -218,7 +205,7 @@ def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
y_train, y_valid = y[train_index], y[valid_index]
_ = network.fit(X_train, y_train, X_valid, y_valid,
callbacks=[callback])
callbacks=[es_callback])
embedded_weights = network.get_embedded_weights()
weights.append(embedded_weights)
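# after the fold loop, `weights` holds one list of embedding matrices per CV fold (one matrix per categorical feature)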
@@ -268,7 +255,7 @@ def encode_dataframe(df_to_enc, target_name, artifacts_path):
network.make_visualizations_from_network(extension='png')
df_to_enc = df_to_enc.drop(target_name, axis=1)
for index in range(df_to_enc.shape[1] - 1):
for index in range(df_to_enc.shape[1]):
column = df_to_enc.columns[index]
labels_column = labels[index]
embeddings_column = embedded_weights[index]
......
@@ -4,7 +4,8 @@ from tools import file_reader, file_writer, feature_maker
from tools import preprocessor
import pandas as pd
def main(use_real_ats_names: bool = False):
def main(use_real_ats_names: bool = False,
ats_resolution: int = None):
clusters = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'cl.csv',
converters={'CitizenId': str, 'Cluster': int})
screenings = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'screenings.csv',
@@ -12,15 +13,18 @@ def main(use_real_ats_names: bool = False):
fall_data = pd.DataFrame(file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'fd.pkl'))
fall_data = fall_data.drop_duplicates(["CitizenId", "Date"])
make_complete_case(screenings, clusters, use_real_ats_names)
make_compliance_case(screenings, clusters, use_real_ats_names)
make_fall_case(screenings, clusters, use_real_ats_names)
make_fall_test_case(screenings, clusters, fall_data, use_real_ats_names)
if ats_resolution is None:
ats_resolution = cfg.ATS_RESOLUTION
def make_complete_case(df, clusters, use_real_ats_names):
make_complete_case(screenings, clusters, use_real_ats_names, ats_resolution)
make_compliance_case(screenings, clusters, use_real_ats_names, ats_resolution)
make_fall_case(screenings, clusters, use_real_ats_names, ats_resolution)
make_fall_test_case(screenings, clusters, fall_data, use_real_ats_names, ats_resolution)
def make_complete_case(df, clusters, use_real_ats_names, ats_resolution):
df['Cluster'] = clusters['Cluster']
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=cfg.ATS_RESOLUTION)
resolution=ats_resolution)
df = feature_maker.make_complete_feature(df)
ats_cols = df.filter(regex='Ats', axis=1)
@@ -34,10 +38,10 @@ def make_complete_case(df, clusters, use_real_ats_names):
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, f'complete.csv')
def make_compliance_case(df, clusters, use_real_ats_names):
def make_compliance_case(df, clusters, use_real_ats_names, ats_resolution):
df['Cluster'] = clusters['Cluster']
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=cfg.ATS_RESOLUTION)
resolution=ats_resolution)
df = feature_maker.make_compliance_feature(df)
ats_cols = df.filter(regex='Ats', axis=1)
@@ -51,10 +55,10 @@ def make_compliance_case(df, clusters, use_real_ats_names):
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, f'compliance.csv')
def make_fall_case(df, clusters, use_real_ats_names):
def make_fall_case(df, clusters, use_real_ats_names, ats_resolution):
df['Cluster'] = clusters['Cluster']
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=cfg.ATS_RESOLUTION)
resolution=ats_resolution)
df = feature_maker.make_fall_feature(df)
ats_cols = df.filter(regex='Ats', axis=1)
@@ -68,10 +72,10 @@ def make_fall_case(df, clusters, use_real_ats_names):
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, f'fall.csv')
def make_fall_test_case(df, clusters, fall_data, use_real_ats_names):
def make_fall_test_case(df, clusters, fall_data, use_real_ats_names, ats_resolution):
df['Cluster'] = clusters['Cluster']
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=cfg.ATS_RESOLUTION)
resolution=ats_resolution)
df = preprocessor.split_cat_columns(df, col_to_split='Ex', tag='Ex',
resolution=cfg.EX_RESOLUTION)
df = feature_maker.make_fall_test_feature(df, fall_data)
......
@@ -145,7 +145,7 @@ def get_screenings_by_id(data, id):
single_screening['Ex'] = screening.ExerciseContent
single_screening['NumberEx'] = feature_maker.get_number_exercises(screening.ExerciseContent)
single_screening['HasFallRisk'] = sum(map(screening.ExerciseContent.count,
cfg.FALL_EXERCISES)) > cfg.FALL_EXERCISE_THRESHOLD
cfg.FALL_EXERCISES)) > cfg.FALL_EXERCISE_THRESHOLD
screenings = pd.concat([screenings, single_screening], axis=0, ignore_index=True)
......
#!/usr/bin/env python
import numpy as np
import config as cfg
import pandas as pd
from tools import file_reader, preprocessor, neural_embedder
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
class NetworkCategory:
def __init__(self, alias: str, unique_values: int):
self.alias = alias
self.unique_values = unique_values
self.embedding_size = self.get_embedding_size(unique_values)
def get_embedding_size(self, unique_values: int) -> int:
size = int(min(np.ceil(unique_values / 2), 50))
if size < 2:
return 2
else:
return size
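# e.g. 3 unique values -> size 2, 10 -> 5, 80 -> 40; 100 or more is capped at 50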
def transpose_to_list(X):
features_list = []
for index in range(X.shape[1]):
features_list.append(X[..., [index]])
return features_list
def ginic(actual, pred):
n = len(actual)
a_s = actual[np.argsort(pred)]
a_c = a_s.cumsum()
giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0
return giniSum / n
def gini_normalizedc(a, p):
return ginic(a, p) / ginic(a, a)
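# normalized Gini rescales ginic so a perfect ranking scores 1; for binary targets it matches 2*AUC - 1 (up to tie handling)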
def get_categorial_cols(df, target_name):
cat_list = []
for category in df:
if not category == target_name and is_string_dtype(df[category]):
cat_list.append(NetworkCategory(category, df[category].nunique()))
return cat_list
def get_numerical_cols(df, target_name):
num_list = []
for category in df:
if not category == target_name and is_numeric_dtype(df[category]):
num_list.append(category)
return num_list
def build_embedding_network(cat_cols, num_cols):
# Make numerical layers
numerical_inputs = []
numerical_outputs = []
for category in num_cols:
input_category = tf.keras.layers.Input(shape=(1,))
output_category = tf.keras.layers.Dense(1, name=category)(input_category)
numerical_inputs.append(input_category)
numerical_outputs.append(output_category)
# Make embedding layers
embedding_inputs = []
embedding_outputs = []
for category in cat_cols:
input_category = tf.keras.layers.Input(shape=(1,))
output_category = tf.keras.layers.Embedding(input_dim=category.unique_values,
output_dim=category.embedding_size,
name=category.alias)(input_category)
output_category = tf.keras.layers.Reshape(target_shape=(category.embedding_size,))(output_category)
embedding_inputs.append(input_category)
embedding_outputs.append(output_category)
# Concatenate layers
model_inputs = numerical_inputs + embedding_inputs
model_outputs = numerical_outputs + embedding_outputs
# Make hidden layers
output_model = tf.keras.layers.Concatenate()(model_outputs)
layer_sizes = [80, 20, 10]
dropout_rates = [.35, .15, .15]
for layer_size, dropout_rate in zip(layer_sizes, dropout_rates):
output_model = tf.keras.layers.Dense(layer_size)(output_model)
output_model = tf.keras.layers.Activation("relu")(output_model)
output_model = tf.keras.layers.Dropout(dropout_rate)(output_model)
# Make final layer
output_model = tf.keras.layers.Dense(1)(output_model)
output_model = tf.keras.layers.Activation('sigmoid')(output_model)
metrics = [
tf.keras.metrics.BinaryAccuracy(name='accuracy'),
tf.keras.metrics.Precision(name='precision'),
tf.keras.metrics.Recall(name='recall'),
tf.keras.metrics.AUC(name='auc'),
]
model = tf.keras.Model(inputs=model_inputs, outputs=output_model)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
return model
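# the compiled model expects a list of single-column inputs: numerical columns first, then one per embedded categorical column (the split transpose_to_list produces)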
def main():
ats_cols = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
'complete.csv',
converters=ats_cols)
target_name = "Complete"
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
# Collect embedded and numerical cols
cat_cols = get_categorial_cols(df, target_name)
num_cols = get_numerical_cols(df, target_name)
# Prepare the data
X, y = preprocessor.get_X_y(df, target_name)
X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train[:,:n_numerical_cols])
X_test_sc = scaler.transform(X_test[:,:n_numerical_cols])
X_train = np.concatenate([X_train_sc, X_train[:,n_numerical_cols:]], axis=1)
X_test = np.concatenate([X_test_sc, X_test[:,n_numerical_cols:]], axis=1)
# Network training
K = 5
runs_per_fold = 3
n_epochs = 10
cv_ginis = []
full_valid_preds = np.zeros(np.shape(X_train)[0])
y_preds = np.zeros((np.shape(X_test)[0],K))
kfold = StratifiedKFold(n_splits=K, random_state=0, shuffle=True)
model = build_embedding_network(cat_cols, num_cols)
for i, (f_ind, outf_ind) in enumerate(kfold.split(X_train, y_train)):
X_train_f, X_valid_f = X_train[f_ind], X_train[outf_ind]
y_train_f, y_valid_f = y_train[f_ind], y_train[outf_ind]