Commit 42ddc2fb authored by thecml

Improved the data loader; the k-modes tuner now uses all features

parent 98c37ca2
Pipeline #66376 passed in 3 minutes and 47 seconds
@@ -10,6 +10,13 @@ threshold_training: 10
fall_exercise_threshold: 3
fall_exercises: ['8058','8062','8066','8077','8074','8059','8071','8067']
# Settings for data loader -------------------------------------------------
#
features_to_normalize: ['BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts']
features_to_scale: ['Gender_Male', 'Gender_Female', 'BirthYear',
'Cluster', 'LoanPeriod', 'NumberAts']
# Settings for dataset -------------------------------------------------
#
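A minimal sketch of how these keys might be consumed by the loader; the `scale_features` helper below is hypothetical, and the repository's actual implementation in `tools/data_loader.py` may differ:
```
# Hypothetical helper, not from this repository: standard-scale only the
# columns listed under features_to_scale in settings.yaml.
from sklearn.preprocessing import StandardScaler

def scale_features(df, settings):
    cols = [c for c in settings['features_to_scale'] if c in df.columns]
    df[cols] = StandardScaler().fit_transform(df[cols])
    return df
```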
%% Cell type:code id: tags:
```
import tensorflow as tf
import numpy as np
import pandas as pd
from pathlib import Path
import paths as pt
import yaml
from tools import data_loader, preprocessor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def make_model(input_dim):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(80,
                                    input_dim=input_dim,
                                    activation='relu'))
    model.add(tf.keras.layers.Dropout(0.35))
    model.add(tf.keras.layers.Dense(20, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.15))
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.15))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ]
    model.compile(loss='binary_crossentropy',
                  optimizer="Adam",
                  metrics=metrics)
    return model

# Load settings
with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
    settings = yaml.safe_load(stream)

# Load the data
file_name = "complete_emb.csv"
dl = data_loader.CompleteDataLoader(file_name, settings).load_data()
features = dl.get_features()
X, y = dl.get_data()

# Calculate class weight from the label distribution
neg, pos = np.bincount(y)
class_weight = preprocessor.get_class_weight(neg, pos)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    stratify=y, random_state=0)

# Scale only the numeric features; fit the scaler on the training split
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
numeric_feats = ['BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts']
scaler = StandardScaler()
X_train_enc = pd.DataFrame(scaler.fit_transform(X_train[numeric_feats]),
                           columns=numeric_feats)
X_test_enc = pd.DataFrame(scaler.transform(X_test[numeric_feats]),
                          columns=numeric_feats)

# Reassemble: one-hot gender, the scaled numerics, then the remaining columns
rest_X_train = X_train.drop(numeric_feats, axis=1)
rest_X_test = X_test.drop(numeric_feats, axis=1)
ats_feats = ['Rand', '1Ats', '2Ats', '3Ats', '4Ats', '5Ats',
             '6Ats', '7Ats', '8Ats', '9Ats', '10Ats']
X_train_sc = pd.concat([rest_X_train[['Gender_Male', 'Gender_Female']],
                        X_train_enc, rest_X_train[ats_feats]], axis=1)
X_test_sc = pd.concat([rest_X_test[['Gender_Male', 'Gender_Female']],
                       X_test_enc, rest_X_test[ats_feats]], axis=1)
```
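%% Cell type:markdown id: tags:
`preprocessor.get_class_weight` is not part of this diff; it presumably follows the standard inverse-frequency recipe for Keras class weights. A minimal sketch under that assumption, not the repository's actual implementation:
%% Cell type:code id: tags:
```
# Assumed sketch: weight each class inversely to its frequency so that
# both classes contribute equally to the binary cross-entropy loss.
def get_class_weight(neg, pos):
    total = neg + pos
    return {0: (1 / neg) * (total / 2.0),
            1: (1 / pos) * (total / 2.0)}
```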
%% Cell type:code id: tags:
```
# Convert everything to numpy arrays for Keras and LIME
X_train = np.array(X_train)
X_train_sc = np.array(X_train_sc)
X_test = np.array(X_test)
X_test_sc = np.array(X_test_sc)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Train the model
model = make_model(input_dim=X_train_sc.shape[1])
history = model.fit(X_train_sc, y_train, epochs=10,
                    class_weight=class_weight,
                    batch_size=32, verbose=0)
```
%% Output
WARNING:tensorflow:From C:\Users\cml\miniconda3\envs\py38-air\lib\site-packages\tensorflow\python\ops\array_ops.py:5043: calling gather (from tensorflow.python.ops.array_ops) with validate_indices is deprecated and will be removed in a future version.
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
%% Cell type:code id: tags:
```
def predict(data):
    # LIME passes raw (unscaled) rows, so scale the numeric columns first;
    # copy to avoid mutating LIME's perturbed samples in place
    global model, scaler
    data = data.copy()
    data[:, 2:6] = scaler.transform(data[:, 2:6])
    y_pred = model.predict(data).reshape(-1, 1)
    y_pred = (y_pred > 0.5)
    print(np.array(list(zip(1 - y_pred.reshape(data.shape[0]),
                            y_pred.reshape(data.shape[0])))))
    # LIME expects per-class probabilities of shape (n_samples, 2)
    return np.hstack((1 - y_pred, y_pred))

import lime
import lime.lime_tabular

explainer = lime.lime_tabular.LimeTabularExplainer(X_train, mode='classification',
                                                   class_names=['No complete', 'Complete'],
                                                   feature_names=features)
exp = explainer.explain_instance(X_test[27], predict, num_features=X_train.shape[1])
```
%% Output
[[0 1]
[1 0]
[1 0]
[0 1]
...
[0 1]
[1 0]
[1 0]
[1 0]]
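%% Cell type:markdown id: tags:
LIME's classification mode expects `predict_fn` to return per-class probabilities of shape `(n_samples, 2)`, which is why `predict` stacks `1 - y_pred` and `y_pred`. Since `y_pred` is thresholded at 0.5 first, LIME only sees hard 0/1 labels; a variant that returns the raw sigmoid outputs (a sketch, not part of this commit) would give the local surrogate model smoother signals:
%% Cell type:code id: tags:
```
# Sketch: same wrapper without thresholding, returning calibrated probabilities
def predict_proba(data):
    data = data.copy()
    data[:, 2:6] = scaler.transform(data[:, 2:6])
    p = model.predict(data).reshape(-1, 1)
    return np.hstack((1 - p, p))
```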
%% Cell type:code id: tags:
```
exp.as_list()
```
%% Output
[('10Ats > 0.03', 0.20651574242348936),
('3Ats <= -0.05', 0.192865743355257),
('1Ats > 0.07', 0.18353986581301904),
('2Ats > -0.03', -0.18296575285910985),
('7Ats <= -0.02', -0.16874453649744492),
('9Ats <= -0.01', -0.13263677767888812),
('8Ats <= 0.06', -0.1273679310001168),
('6Ats > 0.08', 0.11597565308592392),
('5Ats <= -0.04', 0.11306311651048453),
('Gender_Female <= 0.00', -0.08515210039953246),
('8.00 < NumberAts <= 14.00', 0.06138616356339817),
('LoanPeriod > 1627.50', -0.05305334898011001),
('0.00 < Gender_Male <= 1.00', 0.05105843326784843),
('29.00 < BirthYear <= 35.00', 0.04509384864824174),
('Rand <= 0.25', 0.03759473681614202),
('0.03 < 4Ats <= 0.08', 0.0045078832741941325),
('Cluster > 11.00', 2.0051051251007468e-05)]
%% Cell type:code id: tags:
```
model.predict(np.array([X_test_sc[27],]))
```
%% Output
array([[0.59483784]], dtype=float32)
@@ -9,7 +9,6 @@ import tensorflow as tf
from pathlib import Path
tf.get_logger().setLevel('ERROR')
NUM_ITER = 10
CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
@@ -4,47 +4,57 @@ import paths as pt
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from tools import file_reader, file_writer, preprocessor, data_loader, feature_maker
from kmodes import kmodes
from pathlib import Path
import yaml

class ClusterMaker(BaseEstimator, TransformerMixin):
    def __init__(self, init='random', n_clusters=1, n_init=1, ats_resolution=10):
        self.init = init
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.ats_resolution = ats_resolution

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Cluster the trailing ats columns with k-modes and replace them
        # with a single Cluster feature
        model = kmodes.KModes(init=self.init, n_clusters=self.n_clusters,
                              n_init=self.n_init,
                              cat_dissim=kmodes.ng_dissim,
                              n_jobs=-1)
        model.fit(X.iloc[:, -self.ats_resolution:].astype(str))
        predictions = model.predict(X.iloc[:, -self.ats_resolution:].astype(str))
        predictions = pd.Series(predictions, name="Cluster")
        X = X.iloc[:, :-self.ats_resolution].reset_index(drop=True)
        X['Cluster'] = predictions
        return X

def main():
    # Load settings
    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
        settings = yaml.safe_load(stream)

    # Load screenings and make ats features from them
    screenings = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                                      converters={'CitizenId': str})
    df_ats = preprocessor.split_cat_columns(screenings, col_to_split='Ats',
                                            tag='Ats', resolution=10)
    df_ats = feature_maker.make_complete_feature(df_ats, settings)
    ats_cols = df_ats.filter(regex=r'((\d+)[Ats])\w+', axis=1)

    # Load the processed embeddings dataset
    dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
    X, y = dl.get_data()

    # Merge the two datasets so the tuner sees all features
    X = pd.concat([X, ats_cols], axis=1)

    pipeline = Pipeline([
        ('cluster_maker', ClusterMaker()),
        ('clf', RandomForestClassifier(random_state=0, class_weight="balanced"))
    ])
    param_grid = [
@@ -55,27 +65,23 @@ def main():
        }
    ]
    scoring = 'average_precision'
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    clf = GridSearchCV(pipeline, param_grid=param_grid,
                       scoring=scoring, cv=skf)
    clf.fit(X, y)

    print('\nAll results:')
    print(clf.cv_results_)
    print('\nBest estimator:')
    print(clf.best_estimator_)
    print('\nBest score:')
    print(clf.best_score_)
    print('\nBest hyperparameters:')
    print(clf.best_params_)

    results = pd.DataFrame(clf.cv_results_)
    file_writer.write_csv(results, pt.REPORTS_DIR,
                          'kmodes-settings-grid-search-results.csv')
if __name__ == '__main__':
    main()
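For reference, k-modes clusters purely categorical rows by counting attribute mismatches, here weighted by Ng's dissimilarity measure. A standalone sketch mirroring the `ClusterMaker` calls above, on toy category codes rather than real ats data:
```
# Toy example, not repository data: cluster categorical codes with k-modes
import pandas as pd
from kmodes import kmodes

ats = pd.DataFrame({'1Ats': ['120606', '120606', '122203'],
                    '2Ats': ['043303', '122203', '043303']})
km = kmodes.KModes(init='random', n_clusters=2, n_init=1,
                   cat_dissim=kmodes.ng_dissim, n_jobs=-1)
clusters = km.fit_predict(ats.astype(str))  # one cluster label per row
```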
from tools import data_loader
import tensorflow as tf
import kerastuner as kt
from pathlib import Path
import paths as pt
import shutil

CASE = "Complete"
COMPLETE_FILENAME = "complete_with_embeddings.csv"
FALL_FILENAME = "fall_with_embeddings.csv"
SCALING_STRATEGY = "Standard"

def create_model(hp, input_dim=14):
    model = tf.keras.models.Sequential()
    hp_activation = hp.Choice('activation', ['softmax', 'relu', 'sigmoid', 'linear'])
    for i in range(hp.Int('num_layers', 0, 2)):
        model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
                                                     min_value=32,
                                                     max_value=128,
                                                     step=8),
                                        input_dim=input_dim,
                                        activation=hp_activation))
    hp_dropout = hp.Choice('dropout', values=[0.1, 0.2, 0.5])
    model.add(tf.keras.layers.Dropout(hp_dropout))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

def main():
    if CASE == "Complete":
        X_train, X_test, y_train, y_test = data_loader.CompleteDataLoader(
            COMPLETE_FILENAME).load_data().prepare_data_split(SCALING_STRATEGY, 0.3)
    else:
        X_train, X_test, y_train, y_test = data_loader.FallDataLoader(
            FALL_FILENAME).load_data().prepare_data_split(SCALING_STRATEGY, 0.3)
    tuner = kt.BayesianOptimization(create_model,
                                    objective='val_accuracy',
                                    max_trials=20,
                                    executions_per_trial=2,
                                    directory=Path.joinpath(pt.REPORTS_DIR, 'keras_tuner'),
                                    project_name='complete_mlp',
                                    seed=0)
    tuner.search(X_train, y_train, epochs=50,
                 validation_data=(X_test, y_test))
    print(tuner.get_best_hyperparameters(num_trials=1)[0].values)
    # Clean up tuner artifacts after printing the best hyperparameters
    shutil.rmtree(Path.joinpath(pt.REPORTS_DIR, 'keras_tuner'))

if __name__ == '__main__':
    main()
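Once the search finishes, the winning configuration can be rebuilt into a fresh model via the standard Keras Tuner API; a short sketch continuing from the `tuner` object above (before its directory is deleted):
```
# Sketch: rebuild and retrain a model from the best hyperparameters found
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.hypermodel.build(best_hp)
best_model.fit(X_train, y_train, epochs=50,
               validation_data=(X_test, y_test))
```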
import numpy as np
import pandas as pd
import paths as pt
import yaml
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from tools import data_loader

def main():
    # Load settings and data
    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
        settings = yaml.safe_load(stream)
    dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
    X, y = dl.get_data()

    param_grid = {
        'n_estimators': [400, 800],
        'class_weight': ['balanced'],
        'max_features': ['auto'],
        'max_depth': [6],
        'min_samples_split': [10],
        'min_samples_leaf': [3],
        'criterion': ['gini']
    }
    model = RandomForestClassifier(random_state=0,
                                   class_weight="balanced")

    rounds = 5
    outer_scores = np.zeros(rounds)
    nested_scores = np.zeros(rounds)
    metric = 'average_precision'
    best_params = list()
    for i in range(rounds):
        # Define both cross-validation objects (inner & outer)
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

        # Non-nested parameter search and scoring
        clf = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=inner_cv, scoring=metric)
        clf.fit(X, y)
        outer_scores[i] = clf.best_score_
        best_params.append(clf.best_params_)

        # Nested CV with parameter optimization
        nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv,
                                       scoring=metric)
        nested_scores[i] = nested_score.mean()

    score_difference = outer_scores - nested_scores
    print("Avg. difference of {:6f} with std. dev. of {:6f}."
          .format(score_difference.mean(), score_difference.std()))

    # Print the best params per round
    for i, best_param in enumerate(best_params):
        print(f"Round {i+1}: {best_param}")

    # Plot scores on each round for nested and non-nested cross-validation
    plt.style.use('seaborn')
    plt.figure(figsize=(10, 5))
    outer_scores_line, = plt.plot(outer_scores, color='orange')
    nested_line, = plt.plot(nested_scores, color='steelblue')
    plt.ylabel("Score", fontsize="14")
    plt.legend([outer_scores_line, nested_line],
               ["Non-Nested CV", "Nested CV"],
               bbox_to_anchor=(0, .4, .5, 0))
    plt.title("Non-Nested vs Nested Cross-Validation",
              x=.5, y=1.1, fontsize="15")
    plt.tight_layout()

    # Plot a bar chart of the per-round difference
    plt.figure(figsize=(10, 5))
    difference_plot = plt.bar(range(rounds), score_difference)
    plt.xlabel("Individual Trial #")
    plt.legend([difference_plot],
               ["Non-Nested CV - Nested CV Score"],
               bbox_to_anchor=(0, 1, .8, 0))
    plt.ylabel("Score difference", fontsize="14")
    plt.tight_layout()
    plt.show()

if __name__ == '__main__':
    main()