Commit 7c4dda75 authored by thecml
Browse files

added option to group ats

parent a445a6df
Pipeline #63173 passed with stage
in 2 minutes and 59 seconds
......@@ -6,8 +6,9 @@ from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
CASE = "Complete"
COMPLETE_FILENAME = "complete_emb.csv"
FALL_FILENAME = "fall_emb.csv"
COMPLETE_FILENAME = "complete_emb_cv.csv"
COMPLIANCE_FILENAME = "compliance_emb_cv.csv"
FALL_FILENAME = "fall_emb_cv.csv"
COLORS = plt.rcParams['axes.prop_cycle'].by_key()['color']
METRICS = [
......@@ -46,6 +47,9 @@ def main():
if CASE == "Complete":
X, y = data_loader.CompleteDataLoader(COMPLETE_FILENAME) \
.load_data().prepare_data()
elif CASE == "Compliance":
X, y = data_loader.ComplianceDataLoader(COMPLIANCE_FILENAME) \
.load_data().prepare_data()
else:
X, y = data_loader.FallDataLoader(FALL_FILENAME) \
.load_data().prepare_data()
......@@ -54,20 +58,20 @@ def main():
test_size=0.3, random_state=0)
model = make_model(input_dim=X.shape[1])
no_class_weight_history = model.fit(np.array(X_train), np.array(y_train), epochs=25,
validation_data=(np.array(X_valid), np.array(y_valid)), batch_size=16)
no_class_weight_history = model.fit(np.array(X_train), np.array(y_train), epochs=50,
validation_data=(np.array(X_valid), np.array(y_valid)), batch_size=16, verbose=False)
neg, pos = np.bincount(y_train)
initial_bias = np.log([pos/neg])
model = make_model(input_dim=X.shape[1], output_bias=initial_bias)
no_class_weight_bias_history = model.fit(np.array(X_train), np.array(y_train), epochs=25,
validation_data=(np.array(X_valid), np.array(y_valid)), batch_size=16)
no_class_weight_bias_history = model.fit(np.array(X_train), np.array(y_train), epochs=50,
validation_data=(np.array(X_valid), np.array(y_valid)), batch_size=16, verbose=False)
class_weight = preprocessor.get_class_weight(neg, pos)
model = make_model(input_dim=X.shape[1])
class_weight_history = model.fit(np.array(X_train), np.array(y_train), epochs=25,
class_weight_history = model.fit(np.array(X_train), np.array(y_train), epochs=50,
validation_data=(np.array(X_valid), np.array(y_valid)), batch_size=16,
class_weight=class_weight)
class_weight=class_weight, verbose=False)
make_plots(no_class_weight_history, f"{CASE} no class weight", 0, 'loss')
make_plots(no_class_weight_bias_history, f"{CASE} no class weight bias", 1, 'loss')
......
......@@ -14,6 +14,7 @@ from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
USE_CROSS_VALID = True
USE_GROUPING = True
ENABLE_EMB_VIZ = False
VERBOSE = False
......@@ -42,9 +43,20 @@ def make_complete_emb():
'complete.csv',
converters=ats)
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
if USE_GROUPING: # encode groups of ats
for col_idx in range(1, cfg.ATS_RESOLUTION-1, 5):
df[f'{col_idx}_{col_idx+4}Ats'] = df[f'{col_idx}Ats'].apply(str) \
+ '_' + df[f'{col_idx+1}Ats'].apply(str) \
+ '_' + df[f'{col_idx+2}Ats'].apply(str) \
+ '_' + df[f'{col_idx+3}Ats'].apply(str) \
+ '_' + df[f'{col_idx+4}Ats'].apply(str)
emb_cols = df.filter(regex='((\d+)_(\d+)[Ats])\w+', axis=1)
df = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts', 'Complete']]
df = pd.concat([df.drop(target_name, axis=1), emb_cols, df.pop(target_name)], axis=1)
else:
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
df_to_enc = df.iloc[:,n_numerical_cols:]
artifacts_path = cfg.COMPLIANCE_EMB_DIR
......@@ -53,8 +65,12 @@ def make_complete_emb():
else:
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path)
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
df = df.drop(ats_cols, axis=1)
if USE_GROUPING:
df = df.drop(columns=list(df.filter(regex='_')))
else:
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
df = df.drop(ats_cols, axis=1)
df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
if USE_CROSS_VALID:
......@@ -69,7 +85,19 @@ def make_compliance_emb():
f'compliance.csv',
converters=ats)
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
if USE_GROUPING: # encode groups of ats
for col_idx in range(1, cfg.ATS_RESOLUTION-1, 5):
df[f'{col_idx}_{col_idx+4}Ats'] = df[f'{col_idx}Ats'].apply(str) \
+ '_' + df[f'{col_idx+1}Ats'].apply(str) \
+ '_' + df[f'{col_idx+2}Ats'].apply(str) \
+ '_' + df[f'{col_idx+3}Ats'].apply(str) \
+ '_' + df[f'{col_idx+4}Ats'].apply(str)
emb_cols = df.filter(regex='((\d+)_(\d+)[Ats])\w+', axis=1)
df = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts', 'Compliance']]
df = pd.concat([df.drop(target_name, axis=1), emb_cols, df.pop(target_name)], axis=1)
else:
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
df_to_enc = df.iloc[:,n_numerical_cols:]
......@@ -80,8 +108,12 @@ def make_compliance_emb():
else:
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path)
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
df = df.drop(ats_cols, axis=1)
if USE_GROUPING:
df = df.drop(columns=list(df.filter(regex='_')))
else:
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
df = df.drop(ats_cols, axis=1)
df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
if USE_CROSS_VALID:
......@@ -96,7 +128,19 @@ def make_fall_emb():
f'fall.csv',
converters=ats)
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
if USE_GROUPING: # encode groups of ats
for col_idx in range(1, cfg.ATS_RESOLUTION-1, 5):
df[f'{col_idx}_{col_idx+4}Ats'] = df[f'{col_idx}Ats'].apply(str) \
+ '_' + df[f'{col_idx+1}Ats'].apply(str) \
+ '_' + df[f'{col_idx+2}Ats'].apply(str) \
+ '_' + df[f'{col_idx+3}Ats'].apply(str) \
+ '_' + df[f'{col_idx+4}Ats'].apply(str)
emb_cols = df.filter(regex='((\d+)_(\d+)[Ats])\w+', axis=1)
df = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts', 'Fall']]
df = pd.concat([df.drop(target_name, axis=1), emb_cols, df.pop(target_name)], axis=1)
else:
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
df_to_enc = df.iloc[:,n_numerical_cols:]
......@@ -107,8 +151,12 @@ def make_fall_emb():
else:
df_enc = encode_dataframe(df_to_enc, target_name, artifacts_path)
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
df = df.drop(ats_cols, axis=1)
if USE_GROUPING:
df = df.drop(columns=list(df.filter(regex='_')))
else:
ats_cols = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
df = df.drop(ats_cols, axis=1)
df = pd.concat([df.drop(target_name, axis=1), df_enc, df.pop(target_name)], axis=1)
if USE_CROSS_VALID:
......@@ -187,7 +235,7 @@ def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
network.make_visualizations_from_network(extension='png')
df_to_enc = df_to_enc.drop(target_name, axis=1)
for index in range(df_to_enc.shape[1] - 1):
for index in range(df_to_enc.shape[1]):
column = df_to_enc.columns[index]
labels_column = labels[index]
embeddings_column = new_weights[index]
......
......@@ -76,7 +76,7 @@ def main(dataset_version : str = 'emb'):
early_stopping_rounds = 50
if optimize_rounds:
eval_set=[(X_valid_split, y_valid_split)]
fit_model = model.fit(X_train_split, y_train_split,
fit_model = model.fit(X_train_split, y_train_split,
eval_set=eval_set,
eval_metric=metrics.gini_xgb,
early_stopping_rounds=early_stopping_rounds,
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment