Commit 3a92b513 authored by thecml

added adasyn test, decision tree viz

parent da6e77d2
Pipeline #64875 passed with stage in 3 minutes and 4 seconds
@@ -11,7 +11,5 @@ train_ratio: 0.8
 batch_size: 32
 num_epochs: 5
 verbose: True
-# Network Hyperparams --------------------------------------
-network_layers: [128]
\ No newline at end of file
+network_layers: [128]
+optimizer: "Adam"
\ No newline at end of file
@@ -11,7 +11,5 @@ train_ratio: 0.8
 batch_size: 32
 num_epochs: 20
 verbose: True
-# Network Hyperparams --------------------------------------
-network_layers: [128]
\ No newline at end of file
+network_layers: [128]
+optimizer: "Adam"
\ No newline at end of file
@@ -11,7 +11,5 @@ train_ratio: 0.8
 batch_size: 32
 num_epochs: 10
 verbose: True
-# Network Hyperparams --------------------------------------
-network_layers: [128]
\ No newline at end of file
+network_layers: [128]
+optimizer: "Adam"
\ No newline at end of file
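The three embedding settings files above now carry an explicit optimizer key next to the existing hyperparameters. The repository's own config loader is not shown in this diff; the short sketch below only illustrates reading such a settings block with PyYAML, and the file name is a hypothetical placeholder.

# Sketch only: reads one of the settings files above; the path is illustrative.
import yaml

with open("complete_emb.yaml") as f:
    emb_cfg = yaml.safe_load(f)

# The new key sits alongside the existing hyperparameters.
print(emb_cfg["num_epochs"], emb_cfg["network_layers"], emb_cfg["optimizer"])
# e.g. 5 [128] Adam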
@@ -13,4 +13,4 @@ fall_exercises: ['8058','8062','8066','8077','8074','8059','8071','8067']
 # Settings for dataset -------------------------------------------------
 #
-use_real_ats_names: False
+use_real_ats_names: True
@@ -26,7 +26,6 @@ def main():
    dl = data_loader.FallDataLoader(FALL_FILENAME).load_data()
    X, y = dl.get_data()
    #X['Random'] = np.random.rand(len(X),1) # add random noise col
    cols = X.columns
    X = np.array(X)
    y = np.array(y)
......
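The commit message mentions a decision-tree visualization; the hunk above keeps the column names (cols = X.columns) before X is converted to a NumPy array, which is what a tree plot needs for readable feature names. The project's plotting code is not part of this diff, so the following is only a minimal sketch of the idea using scikit-learn's plot_tree on synthetic data, with a hypothetical output path.

# Hedged sketch: fits a small tree on synthetic data and renders it with the
# retained column names; not the repository's visualization code.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree

X = pd.DataFrame(np.random.rand(100, 3), columns=['NumberAts', 'LoanPeriod', 'Needs'])
y = (np.random.rand(100) > 0.8).astype(int)
cols = X.columns

clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(np.array(X), y)
plt.figure(figsize=(12, 6))
plot_tree(clf, feature_names=list(cols), class_names=['NoFall', 'Fall'], filled=True)
plt.savefig('dt_viz.png', dpi=200)   # hypothetical output file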
import numpy as np
import pandas as pd
import paths as pt
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from tools import data_loader, file_writer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

FILENAME = "fall_emb.csv"

def main():
    dl = data_loader.FallDataLoader(FILENAME).load_data()
    features = dl.get_features()
    X_train, X_test, y_train, y_test = dl.prepare_data_split(test_size=0.7,
                                                             scaling_strategy="Standard")
    param_grid = {
        'n_estimators': [200, 400, 600, 800, 1000],
        'max_features': ['auto'],
        'max_depth': [3],
        'min_samples_split': [2],
        'min_samples_leaf': [3],
        'criterion': ['gini']
    }
    model = RandomForestClassifier(random_state=0,
                                   class_weight="balanced")
    cv_rfc = GridSearchCV(estimator=model, param_grid=param_grid,
                          cv=5, verbose=2, n_jobs=-1, scoring='roc_auc')
    cv_rfc.fit(X_train, y_train)
    print("Best: %f using %s" % (cv_rfc.best_score_, cv_rfc.best_params_))

    rf = cv_rfc.best_estimator_
    print(f"\nACC_TEST: {accuracy_score(y_test, rf.predict(X_test))}")
    print(f"ROC_AUC_TEST: {roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])}")

if __name__ == "__main__":
    main()
\ No newline at end of file
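In the script above, class_weight="balanced" reweights each class inversely to its frequency, so the minority (fall) class contributes as much to the impurity criterion as the majority class. A small illustration of the weights it implies, using scikit-learn's own helper on made-up label counts:

# Illustration only: the label vector here is synthetic, not the project's data.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0] * 90 + [1] * 10)            # 90 negatives, 10 positives
weights = compute_class_weight(class_weight="balanced",
                               classes=np.unique(y), y=y)
# w_c = n_samples / (n_classes * n_c)  ->  [100/(2*90), 100/(2*10)] = [0.56, 5.0]
print(dict(zip(np.unique(y), weights)))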
import numpy as np
import pandas as pd
import paths as pt
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from tools import data_loader, file_writer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import ADASYN

FILENAME = "fall_emb.csv"

def main():
    dl = data_loader.FallDataLoader(FILENAME).load_data()
    features = dl.get_features()
    X_train, X_test, y_train, y_test = dl.prepare_data_split(test_size=0.7,
                                                             scaling_strategy="Standard")
    param_grid = {
        'model__n_estimators': [200, 400, 600, 800, 1000],
        'model__max_features': ['auto'],
        'model__max_depth': [3],
        'model__min_samples_split': [2],
        'model__min_samples_leaf': [3],
        'model__criterion': ['gini']
    }
    rf = RandomForestClassifier(random_state=0)
    adasyn = ADASYN(random_state=0)
    pipeline = Pipeline([('sampling', adasyn), ('model', rf)])
    cv_rfc = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                          cv=5, verbose=2, n_jobs=-1, scoring='roc_auc')
    cv_rfc.fit(X_train, y_train)
    print("Best: %f using %s" % (cv_rfc.best_score_, cv_rfc.best_params_))

    rf = cv_rfc.best_estimator_
    print(f"\nACC_TEST: {accuracy_score(y_test, rf.predict(X_test))}")
    print(f"ROC_AUC_TEST: {roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])}")

if __name__ == "__main__":
    main()
\ No newline at end of file
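Wrapping ADASYN and the forest in an imblearn Pipeline, as the script above does, matters for the grid search: the sampler runs only when a fold is fitted, so the synthetic minority samples never leak into the validation split that produces the ROC-AUC scores. A minimal, self-contained sketch of that behaviour on synthetic data (the dataset here is made up for illustration, not the project's loaders):

# Minimal sketch on synthetic data; not the project's dataset or data loaders.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)

pipe = Pipeline([('sampling', ADASYN(random_state=0)),
                 ('model', RandomForestClassifier(random_state=0))])

# ADASYN resamples each training fold only; the held-out fold stays untouched,
# so the cross-validated AUC is not inflated by synthetic neighbours.
scores = cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')
print(scores.mean())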
import numpy as np
import pandas as pd
import paths as pt
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from tools import data_loader, file_writer
from sklearn.model_selection import StratifiedKFold

CASE = "Complete"
COMPLETE_FILENAME = "complete_count.csv"
FALL_FILENAME = "fall_count.csv"
SCALING_STRATEGY = "Standard"

def main():
    if CASE == "Complete":
        X, y = data_loader.CompleteDataLoader(COMPLETE_FILENAME) \
            .load_data().prepare_data(SCALING_STRATEGY)
    else:
        X, y = data_loader.FallDataLoader(FALL_FILENAME) \
            .load_data().prepare_data(SCALING_STRATEGY)

    params = {
        'min_child_weight': [1, 5, 10, 20],
        'gamma': [0.1, 0.2, 0.5, 1, 1.5, 2, 5, 10],
        'subsample': [0.2, 0.4, 0.6, 0.8, 1.0],
        'colsample_bytree': [0.2, 0.4, 0.6, 0.8, 1.0],
        'max_depth': [1, 2, 3, 4, 5, 10, 20, 50]
    }

    neg, pos = np.bincount(y)
    scale_pos_weight = neg / pos
    xgb = XGBClassifier(learning_rate=0.1,
                        n_estimators=400,
                        scale_pos_weight=scale_pos_weight,
                        objective='binary:logistic',
                        eval_metric='logloss',
                        use_label_encoder=False,
                        seed=0)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    random_search = RandomizedSearchCV(xgb,
                                       param_distributions=params,
                                       n_iter=10,
                                       scoring='neg_log_loss',
                                       n_jobs=-1,
                                       cv=skf,
                                       verbose=3,
                                       random_state=0)
    random_search.fit(X, y)

    print('\n All results:')
    print(random_search.cv_results_)
    print('\n Best estimator:')
    print(random_search.best_estimator_)
    print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (5, 10))
    print(random_search.best_score_ * 2 - 1)
    print('\n Best hyperparameters:')
    print(random_search.best_params_)

    results = pd.DataFrame(random_search.cv_results_)
    file_writer.write_csv(results, pt.REPORTS_DIR, 'xgb-random-grid-search-results.csv')

if __name__ == "__main__":
    main()
\ No newline at end of file
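The script above derives scale_pos_weight from the class counts and reports the best cross-validated score rescaled as a normalized Gini. Note that the search itself optimizes neg_log_loss, so the 2·score − 1 transform only yields a true Gini when the scoring is an AUC. A short worked example of both formulas, with invented numbers:

# Worked example with invented numbers; only the formulas mirror the script above.
import numpy as np

y = np.array([0] * 900 + [1] * 100)
neg, pos = np.bincount(y)            # 900 negatives, 100 positives
scale_pos_weight = neg / pos         # 9.0: each positive weighs as much as nine negatives
print(scale_pos_weight)

auc = 0.80                           # a hypothetical cross-validated ROC-AUC
gini = 2 * auc - 1                   # normalized Gini; 0.60 here, 0 for a random model
print(gini)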
@@ -28,6 +28,8 @@ def make_complete_count():
    unique_ats = list(set(np.concatenate(unique_ats)))
    df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
    df_ats = df_ats.drop(['Ats_0'], axis=1)
    df = df.drop(cols_ats, axis=1)
    df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)
@@ -46,6 +48,8 @@ def make_compliance_count():
    unique_ats = list(set(np.concatenate(unique_ats)))
    df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
    df_ats = df_ats.drop(['Ats_0'], axis=1)
    df = df.drop(cols_ats, axis=1)
    df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)
@@ -65,6 +69,8 @@ def make_fall_count():
    unique_ats = list(set(np.concatenate(unique_ats)))
    df_ats = preprocessor.extract_cat_count(df, unique_ats, cols_ats, 'Ats_')
    df_ats = df_ats.drop(['Ats_0'], axis=1)
    df = df.drop(cols_ats, axis=1)
    df = pd.concat([df.drop(case, axis=1), df_ats, df[[case]]], axis=1)
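Each of the *_count builders above now drops the Ats_0 column after the per-ATS counts are extracted, presumably because '0' is the padding entry in the assistive-aids lists rather than a real device. A rough pandas sketch of the counting-and-dropping idea follows; the column names and the padding convention are assumptions, not the repository's extract_cat_count implementation:

# Hedged sketch of count-encoding a padded category list; not the project's helper.
import pandas as pd

df = pd.DataFrame({'Ats_1': ['8058', '0', '8062'],
                   'Ats_2': ['8062', '8058', '0']})   # '0' used as padding

long = df.stack()                                     # one row per (screening, slot)
counts = (pd.get_dummies(long)                        # one indicator column per ATS code
            .groupby(level=0).sum()                   # counts per screening
            .add_prefix('Ats_'))
counts = counts.drop(columns=['Ats_0'])               # padding carries no information
print(counts)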
......
@@ -44,13 +44,14 @@ def main(ats_resolution: int = None):
     # Encode dataframe given params
     model_path = Path.joinpath(pt.ROOT_DIR, emb_cfg['model_path'])
     df_enc = encode_dataframe(df=df_to_enc,
-                              target_name=emb_cfg['target_name'],
-                              batch_size=emb_cfg['batch_size'],
-                              train_ratio=emb_cfg['train_ratio'],
-                              epochs=emb_cfg['num_epochs'],
-                              network_layers=emb_cfg['network_layers'],
-                              verbose=emb_cfg['verbose'],
-                              model_path=model_path)
+                              target_name=emb_cfg['target_name'],
+                              batch_size=emb_cfg['batch_size'],
+                              train_ratio=emb_cfg['train_ratio'],
+                              epochs=emb_cfg['num_epochs'],
+                              optimizer=emb_cfg['optimizer'],
+                              network_layers=emb_cfg['network_layers'],
+                              verbose=emb_cfg['verbose'],
+                              model_path=model_path)
     df_rand = pd.DataFrame(np.random.rand(len(df),1), columns=['Rand'])
     df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, df.pop(target_name)], axis=1)
@@ -81,14 +82,14 @@ def make_fall_test_emb(ats_resolution):
     df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, df.pop('Fall')], axis=1)
     file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, 'fall_test_emb.csv')

-def encode_dataframe(df, target_name, batch_size, train_ratio,
-                     epochs, network_layers, verbose, model_path):
+def encode_dataframe(df, target_name, batch_size, train_ratio, epochs,
+                     optimizer, network_layers, verbose, model_path):
     X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(df,
                                                                                     target_name,
                                                                                     train_ratio)
     network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
                                              batch_size=batch_size, network_layers=network_layers,
-                                             verbose=verbose, model_path=model_path)
+                                             optimizer_fn=optimizer, verbose=verbose, model_path=model_path)
     network.fit(X_train, y_train, X_val, y_val)
     network.save_model()
     embedded_weights = network.get_embedded_weights()
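encode_dataframe now threads the configured optimizer through to NeuralEmbedder as optimizer_fn. How the embedder consumes that value is internal to the project; if it is the plain name string from the settings files ("Adam"), a typical Keras-style resolution would look like the hedged sketch below. Only tf.keras.optimizers.get is standard API here; the surrounding model and compile call are assumptions for illustration.

# Assumption-labelled sketch: one way an optimizer name such as "Adam" can be
# turned into an optimizer instance before compiling a Keras model.
import tensorflow as tf

def build_optimizer(optimizer_fn, learning_rate=None):
    opt = tf.keras.optimizers.get(optimizer_fn)      # accepts a name string or an instance
    if learning_rate is not None:
        opt.learning_rate = learning_rate
    return opt

model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
model.compile(optimizer=build_optimizer("Adam"), loss='binary_crossentropy')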
@@ -114,14 +115,16 @@ def encode_dataframe(df, target_name, batch_size, train_ratio,
     return df_to_enc

-def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
-    params = get_config(df_to_enc, target_name, artifacts_path)
-    X, y = preprocessor.get_X_y(df_to_enc, target_name)
+def encode_dataframe_cv(df, target_name, batch_size, train_ratio,
+                        epochs, network_layers, verbose, model_path):
+    X, y = preprocessor.get_X_y(df, target_name)
     X, labels = preprocessor.encode_vector_label(X)
     y = np.array(y)
-    network = neural_embedder.NeuralEmbedder(**params)
+    network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
+                                             batch_size=batch_size, network_layers=network_layers,
+                                             verbose=verbose, model_path=model_path)
     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
     es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    mode='min',
@@ -150,7 +153,7 @@ def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
     if ENABLE_EMB_VIZ:
         network.make_visualizations_from_network(extension='png')

-    df_to_enc = df_to_enc.drop(target_name, axis=1)
+    df_to_enc = df.drop(target_name, axis=1)
     for index in range(df_to_enc.shape[1]):
         column = df_to_enc.columns[index]
         labels_column = labels[index]
......
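encode_dataframe_cv pairs StratifiedKFold with a Keras EarlyStopping callback, training one embedder per fold and stopping each run when the validation loss stops improving. A compact, self-contained sketch of that loop on toy data (the model and data are placeholders, not NeuralEmbedder):

# Toy illustration of the k-fold + early-stopping pattern; not the project's embedder.
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold

X = np.random.rand(200, 8).astype('float32')
y = (np.random.rand(200) > 0.7).astype('int32')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min',
                                               patience=3, restore_best_weights=True)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    model = tf.keras.Sequential([tf.keras.layers.Dense(16, activation='relu'),
                                 tf.keras.layers.Dense(1, activation='sigmoid')])
    model.compile(optimizer='Adam', loss='binary_crossentropy')
    hist = model.fit(X[train_idx], y[train_idx],
                     validation_data=(X[val_idx], y[val_idx]),
                     epochs=50, verbose=0, callbacks=[es_callback])
    print(fold, min(hist.history['val_loss']))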
@@ -138,7 +138,7 @@ def get_screenings_by_id(data, id, settings):
     single_screening['Ats'] = feature_maker.get_ats(citizen_data.ats, end_date, settings)
     single_screening['NumberAts'] = feature_maker.get_number_ats(citizen_data.ats, end_date)
-    single_screening['LoanPeriod'] = feature_maker.get_loan_period(citizen_data.ats, end_date)
+    single_screening['LoanPeriod'] = feature_maker.get_avg_loan_period(citizen_data.ats, end_date)
     single_screening['Needs'] = screening.NeedForHelpScore
     single_screening['NeedsReason'] = screening.NeedForHelpReason
......
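The LoanPeriod feature now comes from get_avg_loan_period rather than get_loan_period, i.e. the mean lending duration over a citizen's assistive devices up to the screening date rather than a single period. The helper itself is not shown in this diff; the sketch below is only a plausible reading of it, with made-up column names:

# Hypothetical sketch of an average-loan-period feature; column names are assumptions.
import pandas as pd

def get_avg_loan_period(ats: pd.DataFrame, end_date: pd.Timestamp) -> float:
    """Mean number of days each device has been on loan up to end_date."""
    lent = ats[ats['LendDate'] <= end_date]
    if lent.empty:
        return 0.0
    returned = lent['ReturnDate'].fillna(end_date).clip(upper=end_date)
    return float((returned - lent['LendDate']).dt.days.mean())

ats = pd.DataFrame({'LendDate': pd.to_datetime(['2020-01-01', '2020-03-01']),
                    'ReturnDate': pd.to_datetime(['2020-02-01', pd.NaT])})
print(get_avg_loan_period(ats, pd.Timestamp('2020-04-01')))  # (31 + 31) / 2 = 31.0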
@@ -11,7 +11,7 @@ from sklearn.model_selection import train_test_split
 import xgboost as xgb

 DATA_DIR = pt.PROCESSED_DATA_DIR
-CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
+CASES = ["Complete", "Compliance", "Fall"]

 def main(dataset_version : str = 'emb'):
     for case in CASES:
@@ -47,7 +47,7 @@ def main(dataset_version : str = 'emb'):
        df = df.sample(frac=1, random_state=0).reset_index(drop=True)
        #df['Random'] = np.random.rand(len(df),1) # add random noise col
        X = df.drop([target_name], axis=1)
        y = df[target_name]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
......
@@ -6,8 +6,8 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

class BaseDataLoader(ABC):
    def __init__(self, file_name=None, converters=None):
        """Initializer method that takes a file name"""
    def __init__(self, file_name, converters=None):
        """Initializer method that takes a file name and optionally converters"""
        self.file_name = file_name
        self.converters = converters
@@ -15,62 +15,50 @@ class BaseDataLoader(ABC):
    def load_data(self):
        """Loads the data from a data set at startup"""

    @abstractmethod
    def prepare_data(self, scaling_strategy=None):
        """Prepares the data from a data set"""

    @abstractmethod
    def prepare_data_split(self, scaling_strategy=None, test_size=None):
        """Prepares and splits the data from a data set"""
class ComplianceDataLoader(BaseDataLoader):
    def load_data(self):
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                  self.file_name,
                                  converters=self.converters)
        X = df.drop(['Compliance'], axis=1)
        y = df['Compliance']
        self.X = X
        self.y = y
        return self

    def get_data(self):
        """Returns the features and target"""
        return self.X, self.y

    def get_features(self):
        """Returns the feature names"""
        return self.X.columns
    def prepare_data(self, scaling_strategy: str = "Standard"):
    def prepare_data(self, scaling_strategy=None):
        """Prepares the data from a data set"""
        X = np.array(self.X)
        y = np.array(self.y)
        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
        if scaling_strategy == "Standard":
            scaler = StandardScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        else:
            scaler = MinMaxScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        if scaling_strategy != None:
            emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
            n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
            if scaling_strategy == "Standard":
                scaler = StandardScaler()
                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
            else:
                scaler = MinMaxScaler()
                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        return X, y
    def prepare_data_split(self, scaling_strategy: str = "Standard", test_size: float = 0.3):
    def prepare_data_split(self, test_size, scaling_strategy=None):
        """Prepares and splits the data from a data set"""
        X = np.array(self.X)
        y = np.array(self.y)
        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
        if scaling_strategy == "Standard":
            scaler = StandardScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        else:
            scaler = MinMaxScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        if scaling_strategy != None:
            emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
            n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
            if scaling_strategy == "Standard":
                scaler = StandardScaler()
                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
            else:
                scaler = MinMaxScaler()
                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                            stratify=y, random_state=0)
@@ -87,49 +75,17 @@ class CompleteDataLoader(BaseDataLoader):
        self.X = X
        self.y = y
        return self

    def get_data(self):
        return self.X, self.y

    def get_features(self):
        return self.X.columns

    def prepare_data(self, scaling_strategy: str = "Standard"):
        X = np.array(self.X)
        y = np.array(self.y)
        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
        if scaling_strategy == "Standard":
            scaler = StandardScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        else:
            scaler = MinMaxScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        return X, y

    def prepare_data_split(self, scaling_strategy: str = "Standard", test_size: float = 0.3):
        X = np.array(self.X)
        y = np.array(self.y)
        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
        if scaling_strategy == "Standard":
            scaler = StandardScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        else:
            scaler = MinMaxScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                            stratify=y, random_state=0)
        return X_train, X_test, y_train, y_test

class ComplianceDataLoader(BaseDataLoader):
    def load_data(self):
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                  self.file_name,
                                  converters=self.converters)
        X = df.drop(['Compliance'], axis=1)
        y = df['Compliance']
        self.X = X
        self.y = y
        return self
class FallDataLoader(BaseDataLoader):
    def load_data(self):
@@ -141,49 +97,6 @@ class FallDataLoader(BaseDataLoader):
        self.X = X
        self.y = y
        return self

    def get_data(self):
        return self.X, self.y

    def get_features(self):
        return self.X.columns

    def prepare_data(self, scaling_strategy: str = "Standard"):
        X = np.array(self.X)
        y = np.array(self.y)
        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
        if scaling_strategy == "Standard":
            scaler = StandardScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        else:
            scaler = MinMaxScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        return X, y

    def prepare_data_split(self, scaling_strategy: str, test_size: float):
        X = np.array(self.X)
        y = np.array(self.y)
        emb_cols = self.X.filter(regex='((\d+)[Ats])\w+', axis=1)
        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
        if scaling_strategy == "Standard":
            scaler = StandardScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        else:
            scaler = MinMaxScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                            stratify=y, random_state=0)
        return X_train, X_test, y_train, y_test
class FallTestDataLoader(BaseDataLoader):
    def load_data(self):
@@ -196,40 +109,40 @@ class FallTestDataLoader(BaseDataLoader):
        self.y = y
        return self

    def get_data(self):
        return self.X, self.y

    def prepare_data(self, scaling_strategy: str = "Standard"):
    def prepare_data(self, scaling_strategy: str = None):
        X = np.array(self.X)
        y = np.array(self.y)
        emb_cols = self.X.filter(regex='((\d+)[Ats|Ex])\w+', axis=1)
        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
        if scaling_strategy == "Standard":
            scaler = StandardScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        else:
            scaler = MinMaxScaler()
            X_sc = scaler.fit_transform(X[:,:n_scale_cols])
            X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        if scaling_strategy != None:
            emb_cols = self.X.filter(regex='((\d+)[Ats|Ex])\w+', axis=1)
            n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
            if scaling_strategy == "Standard":
                scaler = StandardScaler()
                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
            else:
                scaler = MinMaxScaler()
                X_sc = scaler.fit_transform(X[:,:n_scale_cols])
                X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
        return X, y
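A detail worth noting in the refactored loaders: scaling is applied only to the leading raw columns, while columns whose names match the embedding pattern (1Ats..., or 1Ats.../1Ex... in the fall-test case) are passed through untouched, since the entity-embedding dimensions are already on a comparable scale. A toy illustration of that split, with invented data and assuming the embedding columns sit at the end of the frame as in the loaders above:

# Toy example of scaling only the non-embedding columns; data and names are invented.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

X_df = pd.DataFrame({'Gender': [0, 1, 1, 0],
                     'NumberAts': [2, 5, 1, 7],
                     '1Ats_0': [0.12, -0.40, 0.33, 0.05],   # embedding dims, already scaled
                     '1Ats_1': [0.90, 0.10, -0.20, 0.44]})

emb_cols = X_df.filter(regex=r'((\d+)[Ats])\w+', axis=1)      # matches the embedding columns
n_scale_cols = X_df.shape[1] - emb_cols.shape[1]

X = np.array(X_df, dtype=float)
X_sc = StandardScaler().fit_transform(X[:, :n_scale_cols])    # scale raw columns only
X = np.concatenate([X_sc, X[:, n_scale_cols:]], axis=1)       # embeddings pass through
print(X.round(2))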