Commit 2bf015d6 authored by thecml

Added notebook for MLP with SHAP

parent fcfe4493
Pipeline #63670 passed with stage in 3 minutes and 22 seconds
This source diff could not be displayed because it is too large. You can view the blob instead.
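
The notebook itself is too large for the diff view, so as orientation only: a minimal sketch of what explaining a Keras MLP with SHAP typically looks like. The model and data names are illustrative stand-ins, not the notebook's actual contents:

    # Hedged sketch: model-agnostic SHAP explanation of a Keras MLP.
    import numpy as np
    import shap
    import tensorflow as tf

    def explain_mlp(model: tf.keras.Model, X_train: np.ndarray, X_test: np.ndarray):
        # A small background sample keeps KernelExplainer tractable.
        background = shap.sample(X_train, 100)
        explainer = shap.KernelExplainer(model.predict, background)
        shap_values = explainer.shap_values(X_test[:50])
        # Global feature-importance view over the explained rows.
        shap.summary_plot(shap_values, X_test[:50])
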
@@ -51,7 +51,7 @@ FALL_TEST_RF_DIR = Path.joinpath(ROOT_DIR, 'models/fall_test/random_forest')
 FALL_TEST_EMB_DIR = Path.joinpath(ROOT_DIR, 'models/fall_test/embeddings')
 GENERAL_FEATURES = ['Gender', 'Age', 'Cluster']
-ATS_RESOLUTION = 50
+ATS_RESOLUTION = 10
 EX_RESOLUTION = 9
 ATS_DELIMITER = 6
 THRESHOLD_WEEKS = 8
@@ -23,7 +23,7 @@ def main():
     header_list = ['CitizenId'] + cols_ats
     df = df[header_list]
-    model = kmodes.KModes(init='Huang', n_clusters=10, n_init=15, n_jobs=-1)
+    model = kmodes.KModes(init='Huang', n_clusters=20, n_init=15, n_jobs=-1)
     model.fit(df.iloc[:, 1:].astype(str))
     predictions = model.predict(df.iloc[:, 1:].to_numpy())
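
For context on the clustering change above: kmodes clusters purely categorical rows by matching dissimilarity, so `n_clusters` directly controls how coarse the resulting `Cluster` feature becomes. A self-contained toy run (made-up data, not the project's):

    import numpy as np
    from kmodes import kmodes

    # Four rows of two categorical features, clustered into two modes.
    X = np.array([['a', 'x'], ['a', 'y'], ['b', 'x'], ['b', 'y']])
    model = kmodes.KModes(init='Huang', n_clusters=2, n_init=5, n_jobs=-1)
    clusters = model.fit_predict(X)   # cluster index per row
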
@@ -13,29 +13,29 @@ from sklearn.model_selection import StratifiedKFold
 from sklearn.metrics import accuracy_score, precision_score
 from sklearn.metrics import recall_score, roc_auc_score
-USE_CROSS_VALID = True
+USE_CROSS_VALID = False
 USE_GROUPING = False
 ENABLE_EMB_VIZ = False
-VERBOSE = True
 def main(ats_resolution: int = None):
     if ats_resolution == None:
         ats_resolution = cfg.ATS_RESOLUTION
     make_complete_emb(ats_resolution)
-    make_compliance_emb(ats_resolution)
-    make_fall_emb(ats_resolution)
-    make_fall_test_emb(ats_resolution)
+    #make_compliance_emb(ats_resolution)
+    #make_fall_emb(ats_resolution)
+    #make_fall_test_emb(ats_resolution)
-def get_config(df_to_enc, target_name, artifacts_path):
+def get_config(df_to_enc, target_name, artifacts_path,
+               batch_size=32, epochs=5):
     return {
         "df": df_to_enc,
         "target_name": target_name,
         "train_ratio": 0.8,
         "network_layers": ([128]),
-        "epochs": 5,
-        "batch_size": 32,
-        "verbose": VERBOSE,
+        "epochs": epochs,
+        "batch_size": batch_size,
+        "verbose": True,
         "artifacts_path": artifacts_path
     }
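
With `batch_size` and `epochs` now surfaced as parameters, callers can tune training per dataset while the rest of the config stays fixed. A hypothetical call, where `df_sample` is a stand-in for one of the prepared case DataFrames:

    # Illustrative only; not a call that appears in this commit.
    config = get_config(df_sample, 'Complete', cfg.COMPLETE_EMB_DIR,
                        batch_size=64, epochs=10)
    df_encoded = encode_dataframe(config)
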
@@ -46,37 +46,21 @@ def make_complete_emb(ats_resolution):
                              'complete.csv',
                              converters=ats)
-    if USE_GROUPING: # encode groups of ats
-        for col_idx in range(1, ats_resolution-1, 5):
-            df[f'{col_idx}_{col_idx+4}Ats'] = df[f'{col_idx}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+1}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+2}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+3}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+4}Ats'].apply(str)
-        emb_cols = df.filter(regex='((\d+)_(\d+)[Ats])\w+', axis=1)
-        df = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts', 'Complete']]
-        df = pd.concat([df.drop(target_name, axis=1), emb_cols, df.pop(target_name)], axis=1)
-    else:
-        emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
+    emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
     n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
+    #n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
+    #df_to_enc = df.iloc[:,n_numerical_cols:]
     df_to_enc = df.iloc[:,n_numerical_cols:]
     artifacts_path = cfg.COMPLETE_EMB_DIR
-    if USE_CROSS_VALID:
-        df_enc = encode_dataframe_cv(df, target_name, artifacts_path)
-    else:
-        df_enc = encode_dataframe(df, target_name, artifacts_path)
-    if USE_GROUPING:
-        df = df.drop(columns=list(df.filter(regex='_')))
-    else:
-        ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
-        df = df.drop(ats_cols, axis=1)
+    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
+    df = df.drop(ats_cols, axis=1)
-    df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
+    config = get_config(df_to_enc, target_name, artifacts_path, epochs=5)
+    df_enc = encode_dataframe(config)
+    df_rand = pd.DataFrame(np.random.rand(len(df),1), columns=['Rand'])
+    df = pd.concat([df.drop(target_name, axis=1), df_enc, df_rand, df.pop(target_name)], axis=1)
     file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
 def make_compliance_emb(ats_resolution):
     target_name = 'Compliance'
     ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
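
The new `Rand` column in `make_complete_emb` above is, by all appearances, a noise baseline for the SHAP analysis: a feature with no signal that a sound attribution ranking should place near the bottom. The trick in isolation (toy frame, not project data):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'Age': [70, 81, 65], 'NumberAts': [3, 5, 2]})
    df['Rand'] = np.random.rand(len(df))   # pure noise; real features should outrank it
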
@@ -84,38 +68,20 @@ def make_compliance_emb(ats_resolution):
                              f'compliance.csv',
                              converters=ats)
-    if USE_GROUPING: # encode groups of ats
-        for col_idx in range(1, ats_resolution-1, 5):
-            df[f'{col_idx}_{col_idx+4}Ats'] = df[f'{col_idx}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+1}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+2}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+3}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+4}Ats'].apply(str)
-        emb_cols = df.filter(regex='((\d+)_(\d+)[Ats])\w+', axis=1)
-        df = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts', 'Compliance']]
-        df = pd.concat([df.drop(target_name, axis=1), emb_cols, df.pop(target_name)], axis=1)
-    else:
-        emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
+    emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
     n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
     df_to_enc = df.iloc[:,n_numerical_cols:]
     artifacts_path = cfg.COMPLIANCE_EMB_DIR
-    if USE_CROSS_VALID:
-        df_enc = encode_dataframe_cv(df_to_enc, target_name, artifacts_path)
-    else:
-        df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path)
-    if USE_GROUPING:
-        df = df.drop(columns=list(df.filter(regex='_')))
-    else:
-        ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
-        df = df.drop(ats_cols, axis=1)
+    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
+    df = df.drop(ats_cols, axis=1)
+    config = get_config(df_to_enc, target_name, artifacts_path, epochs=5)
+    df_enc = encode_dataframe(config)
     df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
     file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
 def make_fall_emb(ats_resolution):
     target_name = 'Fall'
     ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
@@ -123,35 +89,17 @@ def make_fall_emb(ats_resolution):
                              f'fall.csv',
                              converters=ats)
-    if USE_GROUPING: # encode groups of ats
-        for col_idx in range(1, ats_resolution-1, 5):
-            df[f'{col_idx}_{col_idx+4}Ats'] = df[f'{col_idx}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+1}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+2}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+3}Ats'].apply(str) \
-                + '_' + df[f'{col_idx+4}Ats'].apply(str)
-        emb_cols = df.filter(regex='((\d+)_(\d+)[Ats])\w+', axis=1)
-        df = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts', 'Fall']]
-        df = pd.concat([df.drop(target_name, axis=1), emb_cols, df.pop(target_name)], axis=1)
-    else:
-        emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
+    emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
     n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
     df_to_enc = df.iloc[:,n_numerical_cols:]
     artifacts_path = cfg.FALL_EMB_DIR
-    if USE_CROSS_VALID:
-        df_enc = encode_dataframe_cv(df_to_enc, target_name, artifacts_path)
-    else:
-        df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path)
-    if USE_GROUPING:
-        df = df.drop(columns=list(df.filter(regex='_')))
-    else:
-        ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
-        df = df.drop(ats_cols, axis=1)
+    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
+    df = df.drop(ats_cols, axis=1)
+    config = get_config(df_to_enc, target_name, artifacts_path, epochs=10)
+    df_enc = encode_dataframe(config)
     df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
     file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
@@ -173,18 +121,49 @@ def make_fall_test_emb(ats_resolution):
     df_ex_to_enc = df_ex_to_enc.drop(['NumberFalls'], axis=1)
     artifacts_path = cfg.FALL_TEST_EMB_DIR
-    if USE_CROSS_VALID:
-        ats_enc = encode_dataframe_cv(df_ats_to_enc, 'Fall', artifacts_path)
-        ex_enc = encode_dataframe_cv(df_ex_to_enc, 'Fall', artifacts_path)
-    else:
-        ats_enc = encode_dataframe(df_ats_to_enc, 'Fall', artifacts_path)
-        ex_enc = encode_dataframe(df_ex_to_enc, 'Fall', artifacts_path)
+    ats_enc = encode_dataframe(df_ats_to_enc, 'Fall', artifacts_path)
+    ex_enc = encode_dataframe(df_ex_to_enc, 'Fall', artifacts_path)
     df = df.drop(ats_cols + ex_cols, axis=1)
     df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, df.pop('Fall')], axis=1)
     file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
+def encode_dataframe(config):
+    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(config['df'],
+                                                                                    config['target_name'],
+                                                                                    config['train_ratio'])
+    network = neural_embedder.NeuralEmbedder(df=config['df'],
+                                             target_name=config['target_name'],
+                                             epochs=config['epochs'],
+                                             batch_size=config['batch_size'],
+                                             verbose=config['verbose'],
+                                             artifacts_path=config['artifacts_path'])
+    network.fit(X_train, y_train, X_val, y_val)
+    network.save_model()
+    embedded_weights = network.get_embedded_weights()
+    network.save_weights(embedded_weights)
+    network.save_labels(labels)
+    if ENABLE_EMB_VIZ:
+        network.make_visualizations_from_network(extension='png')
+    df_to_enc = config['df'].drop(config['target_name'], axis=1)
+    for index in range(df_to_enc.shape[1]):
+        column = df_to_enc.columns[index]
+        labels_column = labels[index]
+        embeddings_column = embedded_weights[index]
+        pca = PCA(n_components=1)
+        Y = pca.fit_transform(embeddings_column)
+        y_array = np.concatenate(Y)
+        mapping = dict(zip(labels_column.classes_, y_array))
+        target_name = config['target_name']
+        file_writer.write_mapping(mapping,
+                                  Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
+                                  f'{target_name.lower()}_{column}.csv')
+        df_to_enc[column] = df_to_enc[column].replace(to_replace=mapping)
+    return df_to_enc
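
The PCA step in the new `encode_dataframe` collapses each category's learned embedding vector to a single scalar, which then replaces the raw category label in the frame. The same step in isolation, with random weights standing in for trained ones:

    import numpy as np
    from sklearn.decomposition import PCA

    embeddings = np.random.rand(4, 3)   # 4 categories, 3-dim embedding each
    scalars = PCA(n_components=1).fit_transform(embeddings).ravel()
    # One scalar per category, usable as a replacement mapping.
    mapping = dict(zip(['cat_a', 'cat_b', 'cat_c', 'cat_d'], scalars))
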
 def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
     params = get_config(df_to_enc, target_name, artifacts_path)
@@ -237,38 +216,5 @@ def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
     return df_to_enc
-def encode_dataframe(df_to_enc, target_name, artifacts_path):
-    params = get_config(df_to_enc, target_name, artifacts_path)
-    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(df_to_enc,
-                                                                                    target_name,
-                                                                                    params['train_ratio'])
-    network = neural_embedder.NeuralEmbedder(**params)
-    network.fit(X_train, y_train, X_val, y_val)
-    network.save_model()
-    embedded_weights = network.get_embedded_weights()
-    network.save_weights(embedded_weights)
-    network.save_labels(labels)
-    if ENABLE_EMB_VIZ:
-        network.make_visualizations_from_network(extension='png')
-    df_to_enc = df_to_enc.drop(target_name, axis=1)
-    for index in range(df_to_enc.shape[1]):
-        column = df_to_enc.columns[index]
-        labels_column = labels[index]
-        embeddings_column = embedded_weights[index]
-        pca = PCA(n_components=1)
-        Y = pca.fit_transform(embeddings_column)
-        y_array = np.concatenate(Y)
-        mapping = dict(zip(labels_column.classes_, y_array))
-        file_writer.write_mapping(mapping,
-                                  Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
-                                  f'{target_name.lower()}_{column}.csv')
-        df_to_enc[column] = df_to_enc[column].replace(to_replace=mapping)
-    return df_to_enc
 if __name__ == "__main__":
     main()
\ No newline at end of file
@@ -12,7 +12,8 @@ from pandas.api.types import is_string_dtype, is_numeric_dtype
 from sklearn.preprocessing import StandardScaler
 import tensorflow as tf
-CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
+CASE = "Complete"
+FILENAME = "complete.csv"
 class NetworkCategory:
     def __init__(self, alias: str, unique_values: int):
@@ -114,19 +115,18 @@ def build_embedding_network(cat_cols, num_cols):
 def main():
     ats_cols = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
     df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
-                              'complete.csv',
+                              FILENAME,
                               converters=ats_cols)
-    target_name = "Complete"
     emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
     n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
     # Collect embedded and numerical cols
-    cat_cols = get_categorial_cols(df, target_name)
-    num_cols = get_numerical_cols(df, target_name)
+    cat_cols = get_categorial_cols(df, CASE)
+    num_cols = get_numerical_cols(df, CASE)
     # Prepare the data
-    X, y = preprocessor.get_X_y(df, target_name)
+    X, y = preprocessor.get_X_y(df, CASE)
     X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)
     y = np.array(y)
     X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)
@@ -148,8 +148,6 @@ def main():
     kfold = StratifiedKFold(n_splits=K, random_state=0, shuffle=True)
-    model = build_embedding_network(cat_cols, num_cols)
     for i, (f_ind, outf_ind) in enumerate(kfold.split(X_train, y_train)):
         X_train_f, X_valid_f = X_train[f_ind], X_train[outf_ind]
         y_train_f, y_valid_f = y_train[f_ind], y_train[outf_ind]
@@ -157,7 +155,10 @@ def main():
         X_test_f = X_test
         # Upsampling
-        pos = (pd.Series(y_train_f == 1))
+        if CASE == "Complete":
+            pos = (pd.Series(y_train_f == 0))
+        else:
+            pos = (pd.Series(y_train_f == 1))
         X_train_f = np.concatenate((X_train_f, X_train_f[pos]), axis=0)
         y_train_f = np.concatenate((y_train_f, y_train_f[pos]), axis=0)
@@ -171,6 +172,7 @@ def main():
         valid_preds = 0
         for j in range(runs_per_fold):
+            model = build_embedding_network(cat_cols, num_cols)
             model.fit(transpose_to_list(X_train_f), y_train_f,
                       epochs=n_epochs, batch_size=32,
                       validation_data=(transpose_to_list(X_valid_f), y_valid_f),
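
The per-case upsampling above duplicates every row of the chosen class once per fold (class 0 for "Complete", class 1 otherwise). A standalone illustration with made-up labels:

    import numpy as np
    import pandas as pd

    X = np.array([[1.0], [2.0], [3.0], [4.0]])
    y = np.array([0, 0, 0, 1])
    pos = pd.Series(y == 1)                      # boolean mask of the class to duplicate
    X_up = np.concatenate((X, X[pos]), axis=0)   # masked rows appended once more
    y_up = np.concatenate((y, y[pos]), axis=0)
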
@@ -39,6 +39,9 @@ class ComplianceDataLoader(BaseDataLoader):
     def get_data(self):
         return self.X, self.y
+    def get_features(self):
+        return self.X.columns
     def prepare_data(self, scaling_strategy: str = "Standard"):
         X = np.array(self.X)
@@ -91,6 +94,9 @@ class CompleteDataLoader(BaseDataLoader):
     def get_data(self):
         return self.X, self.y
+    def get_features(self):
+        return self.X.columns
     def prepare_data(self, scaling_strategy: str = "Standard"):
         X = np.array(self.X)
         y = np.array(self.y)
@@ -124,7 +130,7 @@ class CompleteDataLoader(BaseDataLoader):
         X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
-                                                    stratify=y, random_state=0)
+                                                            stratify=y, random_state=0)
         return X_train, X_test, y_train, y_test
@@ -142,6 +148,9 @@ class FallDataLoader(BaseDataLoader):
     def get_data(self):
         return self.X, self.y
+    def get_features(self):
+        return self.X.columns
     def prepare_data(self, scaling_strategy: str = "Standard"):
         X = np.array(self.X)
         y = np.array(self.y)
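
The `get_features` accessor added to each loader most plausibly feeds feature names to the SHAP plots; the loaders' construction details are outside this diff, so the usage below is hypothetical:

    # Hypothetical: 'loader' is any of the data loaders above, already populated,
    # and 'shap_values' comes from an explainer run on X.
    X, y = loader.get_data()
    feature_names = list(loader.get_features())
    shap.summary_plot(shap_values, X, feature_names=feature_names)
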
@@ -37,7 +37,7 @@ class NeuralEmbedder:
     def __init__(self,
                  df: pd.DataFrame,
                  target_name: str,
-                 train_ratio: float,
+                 train_ratio: float = 0.8,
                  network_layers: List[int] = (32, 32),
                  dropout_rate: float = 0,
                  activation_fn: str = "relu",
@@ -81,7 +81,7 @@ class NeuralEmbedder:
         self.DEFAULT_LABELS_FILENAME = 'labels.pkl'
         self.DEFAULT_SCALER_FILENAME = 'scaler.pkl'
         self.DEFAULT_PATH_VISUALIZATIONS = 'visualizations'
-        self.mode = "full"
+        self.mode = "emb"
         self.model = self.__make_model()
     def __get_categorial_cols(self, df: pd.DataFrame, target_name: str) -> List:
@@ -86,7 +86,7 @@ def get_class_weight(neg: int, pos: int) -> dict:
     """
     This method computes the class weight for a
     classification problem given the number of
-    negatives and positive labels
+    negative and positive labels
     :param neg: number of negative labels
     :param pos: number of positive labels
     :return: the class weight as a dictionary
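
The body of `get_class_weight` is outside this diff; a common balanced-weighting scheme matching the docstring (the one used in TensorFlow's imbalanced-data tutorial, not necessarily this repo's exact formula) would be:

    def get_class_weight(neg: int, pos: int) -> dict:
        total = neg + pos
        # Weight each class inversely to its frequency, scaled so the
        # overall loss magnitude stays comparable to the unweighted case.
        return {0: total / (2.0 * neg), 1: total / (2.0 * pos)}

    print(get_class_weight(neg=900, pos=100))   # {0: 0.555..., 1: 5.0}
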