Commit 946d6d23 authored by thecml

introduced settings yaml and paths file

parent 90f7a573
Pipeline #64262 failed in 2 minutes and 57 seconds
 import argparse
-import config as cfg
+import paths as pt
 from src.data import parse_and_clean_data, make_screenings
 from src.data import make_clusters, make_dataset_full
 from src.data import make_dataset_count, make_dataset_emb
@@ -41,7 +41,7 @@ def main():
     use_real_ats_names = parsed_args.use_real_ats_names
     run_full_pipeline = parsed_args.run_full_pipeline
     print(f"Client started. Using this configuration:")
-    print(f"Raw data dictionary: {cfg.RAW_DATA_DIR_2020}")
+    print(f"Raw data dictionary: {pt.RAW_DATA_DIR_2020}")
     print(f"Dataset year: {dataset_year}")
     print(f"Dataset version: {dataset_version}")
     print(f"Visualization enabled: {enable_visualization}")
@@ -60,7 +60,7 @@ def main():
     print("Making clusters ...")
     make_clusters.main()
-    print(f"Completed making cluster model. It can be found at: {cfg.CLUSTERS_DIR}\n")
+    print(f"Completed making cluster model. It can be found at: {pt.CLUSTERS_DIR}\n")
     print("Making full dataset ...")
     make_dataset_full.main(use_real_ats_names)
@@ -73,15 +73,15 @@ def main():
     make_dataset_count.main()
     print("\nCompleted generating datasets at:")
-    print(f"Interim data dictionary: {cfg.INTERIM_DATA_DIR}")
-    print(f"Processed data dictionary: {cfg.PROCESSED_DATA_DIR}\n")
+    print(f"Interim data dictionary: {pt.INTERIM_DATA_DIR}")
+    print(f"Processed data dictionary: {pt.PROCESSED_DATA_DIR}\n")
     print(f"Making 4 XGBoost models based on version: {dataset_version} ...\n")
     make_xgb_models.main(dataset_version)
     print(f"Completed making models. Models and SHAP plots can be found at:\n" +
-          f"{cfg.COMPLETE_XGB_DIR}\n" + f"{cfg.COMPLIANCE_XGB_DIR}\n" +
-          f"{cfg.FALL_XGB_DIR}\n" + f"{cfg.FALL_TEST_XGB_DIR}" + "\n")
+          f"{pt.COMPLETE_XGB_DIR}\n" + f"{pt.COMPLIANCE_XGB_DIR}\n" +
+          f"{pt.FALL_XGB_DIR}\n" + f"{pt.FALL_TEST_XGB_DIR}" + "\n")

 if __name__ == "__main__":
     main()
\ No newline at end of file
+---
+# Settings for parser -------------------------------------------------
+#
+ats_resolution: 10
+ex_resolution: 9
+ats_delimiter: 6
+threshold_weeks: 8
+threshold_training: 10
+fall_exercise_threshold: 3
+fall_exercises: ['8058','8062','8066','8077','8074','8059','8071','8067']
+# Settings for dataset -------------------------------------------------
+#
+use_real_ats_names: False
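The new settings file is read with yaml.safe_load, as the cluster and count-dataset scripts at the bottom of this diff show. A minimal sketch of that loading pattern; pt.CONFIGS_DIR is taken from those diffs, while the load_settings helper itself is illustrative and not part of the commit:

import yaml
from pathlib import Path
import paths as pt

def load_settings() -> dict:
    # Read the parser/dataset settings introduced by this commit.
    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
        return yaml.safe_load(stream)

settings = load_settings()
ats_resolution = settings['ats_resolution']            # 10 in this commit
use_real_ats_names = settings['use_real_ats_names']    # False in this commit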
 #!/usr/bin/env python
 import numpy as np
-import config as cfg
+import paths as pt
 from tools import file_writer, data_loader
 from sklearn.model_selection import StratifiedKFold
 import xgboost as xgb
@@ -43,8 +43,8 @@ def main():
     importances = shap_sorted_df['shap_values']
     features = shap_sorted_df['feature']
-    file_writer.write_shap_importance_plot(features, importances, cfg.REPORTS_PLOTS_DIR, PLOT_FILENAME)
-    file_writer.write_csv(shap_sorted_df, cfg.REPORTS_DIR, CSV_FILENAME)
+    file_writer.write_shap_importance_plot(features, importances, pt.REPORTS_PLOTS_DIR, PLOT_FILENAME)
+    file_writer.write_csv(shap_sorted_df, pt.REPORTS_DIR, CSV_FILENAME)

 def get_best_shap_features(X: np.ndarray, y: np.ndarray,
                            cols: List[str], seed: int):
...
 #!/usr/bin/env python
 import numpy as np
-import config as cfg
+import paths as pt
 from typing import List
 from tools import file_reader, file_writer, preprocessor, classifiers
 import tensorflow as tf
-tf.get_logger().setLevel('ERROR')
+from pathlib import Path
+tf.get_logger().setLevel('ERROR')

 NUM_ITER = 10
 CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
@@ -22,9 +23,9 @@ class CVResult:
         self.rec = rec
         self.rocauc = rocauc

-ATS_COLS = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)] \
+ATS_COLS = [str(i)+'Ats' for i in range(1, 10+1)] \
            + ['Cluster', 'LoanPeriod', 'NumberAts']
-EX_COLS = [str(i)+'Ex' for i in range(1, cfg.EX_RESOLUTION+1)] + ['NumberEx']
+EX_COLS = [str(i)+'Ex' for i in range(1, 9+1)] + ['NumberEx']
 CLF_NAMES = ["MLP", "LR", "XGB", "RF", "SVM", "KNN"]
 CLASSIFIERS = {
     "MLP": classifiers.train_mlp_cv,
@@ -37,30 +38,30 @@ CLASSIFIERS = {
 def load_complete():
     ats = {str(i)+'Ats':str for i in range(1,11)}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               'complete.csv',
                               converters=ats)
     return df

 def load_fall():
     converters = {str(i)+'Ats':str for i in range(1,11)}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               'fall.csv',
                               converters=converters)
     return df

 def load_compliance():
     converters = {str(i)+'Ats':str for i in range(1,11)}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               'compliance.csv',
                               converters=converters)
     return df

 def load_fall_test():
-    ex = {str(i)+'Ex':str for i in range(1, cfg.EX_RESOLUTION+1)}
-    ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
+    ex = {str(i)+'Ex':str for i in range(1, 9+1)}
+    ats = {str(i)+'Ats':str for i in range(1, 10+1)}
     converters = {**ex, **ats}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               'fall_test.csv',
                               converters=converters)
     return df
@@ -69,7 +70,7 @@ def main():
     for case in CASES:
         results_filename = f"{case} baseline results.txt"
         # Version 1
-        with open(Path.joinpath(cfg.REPORTS_DIR, results_filename), "w+") as text_file:
+        with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "w+") as text_file:
             text_file.write(f"{case} version 1 - without Ats and/or Ex columns")
         if case == "Complete":
@@ -107,24 +108,24 @@ def main():
                    case, 1, "without Ats and/or Ex columns")
         # Version 2
-        with open(Path.joinpath(cfg.REPORTS_DIR, results_filename), "a") as text_file:
+        with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "a") as text_file:
             text_file.write("\n\n")
             text_file.write(f"{case} version 2 - with embeddings")
         if case == "Complete":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
             X = df.drop(['Complete'], axis=1)
             y = df['Complete']
         elif case == "Compliance":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_emb.csv')
             X = df.drop(['Compliance'], axis=1)
             y = df['Compliance']
         elif case == "Fall":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_emb.csv')
             X = df.drop(['Fall'], axis=1)
             y = df['Fall']
         else:
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
             X = df.drop(['Fall'], axis=1)
             y = df['Fall']
@@ -146,24 +147,24 @@ def main():
                    case, 2, "with embeddings")
         # Version 3
-        with open(Path.joinpath(cfg.REPORTS_DIR, results_filename), "a") as text_file:
+        with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "a") as text_file:
             text_file.write("\n\n")
             text_file.write(f"{case} version 3 - with counts")
         if case == "Complete":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete_count.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_count.csv')
             X = df.drop(['Complete'], axis=1)
             y = df['Complete']
         elif case == "Compliance":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'compliance_count.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_count.csv')
             X = df.drop(['Compliance'], axis=1)
             y = df['Compliance']
         elif case == "Fall":
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_count.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_count.csv')
             X = df.drop(['Fall'], axis=1)
             y = df['Fall']
         else:
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall_test_count.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_test_count.csv')
             X = df.drop(['Fall'], axis=1)
             y = df['Fall']
@@ -219,7 +220,7 @@ def make_and_print_scores(classifer_name: str, k: int, res_acc: List,
     metrics = " - ".join(["{}: {:.4f} (+/- {:.2f})".format(r_m.name, r_m.result, r_s.result)
                           for r_m, r_s in (zip(results_mean, results_std) or [])])
     text = "\r{} K={}: {}".format(classifer_name, k, metrics)
-    with open(Path.joinpath(cfg.REPORTS_DIR, results_filename), "a") as text_file:
+    with open(Path.joinpath(pt.REPORTS_DIR, results_filename), "a") as text_file:
         text_file.write(text)

 def make_plots(y_test: np.ndarray, results: np.ndarray,
@@ -227,16 +228,16 @@ def make_plots(y_test: np.ndarray, results: np.ndarray,
     roc_file_name = f"{case_name} version {version_number} - ROC curves.pdf"
     results_list = list(results)
     file_writer.write_roc_curve(y_test, results_list,
-                                cfg.REPORTS_PLOTS_DIR, roc_file_name, case_subtitle)
-    file_writer.write_accuracy_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
+                                pt.REPORTS_PLOTS_DIR, roc_file_name, case_subtitle)
+    file_writer.write_accuracy_plot(results_list, NUM_ITER, CLF_NAMES, pt.REPORTS_PLOTS_DIR,
                                     f"{case_name} version {version_number} - Accuracy.pdf", case_subtitle)
-    file_writer.write_precision_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_precision_plot(results_list, NUM_ITER, CLF_NAMES, pt.REPORTS_PLOTS_DIR,
                                      f"{case_name} version {version_number} - Precision.pdf", case_subtitle)
-    file_writer.write_recall_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_recall_plot(results_list, NUM_ITER, CLF_NAMES, pt.REPORTS_PLOTS_DIR,
                                   f"{case_name} version {version_number} - Recall.pdf", case_subtitle)
-    file_writer.write_rocauc_plot(results_list, NUM_ITER, CLF_NAMES, cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_rocauc_plot(results_list, NUM_ITER, CLF_NAMES, pt.REPORTS_PLOTS_DIR,
                                   f"{case_name} version {version_number} - ROCAUC.pdf", case_subtitle)
-    file_writer.write_cm_plot(y_test, results_list[2][1], cfg.REPORTS_PLOTS_DIR,
+    file_writer.write_cm_plot(y_test, results_list[2][1], pt.REPORTS_PLOTS_DIR,
                               f"{case_name} version {version_number} - CM.pdf", "XGB - " f'{case_subtitle}')

 if __name__ == '__main__':
...
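Note that the literals 10+1 and 9+1 in ATS_COLS and EX_COLS above mirror ats_resolution and ex_resolution from the new settings.yaml; in this commit only the cluster and count-dataset scripts actually read the file. A hypothetical follow-up (not part of the commit) would derive the column lists from the settings instead of hardcoding the resolutions:

import yaml
from pathlib import Path
import paths as pt  # assumed to expose CONFIGS_DIR, as elsewhere in this commit

with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
    settings = yaml.safe_load(stream)

# Same lists as above, driven by the settings keys rather than literals.
ATS_COLS = [str(i)+'Ats' for i in range(1, settings['ats_resolution']+1)] \
           + ['Cluster', 'LoanPeriod', 'NumberAts']
EX_COLS = [str(i)+'Ex' for i in range(1, settings['ex_resolution']+1)] + ['NumberEx']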
 #!/usr/bin/env python
 import numpy as np
 import pandas as pd
-import config as cfg
+import paths as pt
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.preprocessing import RobustScaler, MaxAbsScaler, QuantileTransformer
 from tools import classifiers, data_loader
@@ -41,16 +41,16 @@ def main():
                MaxAbsScaler(), QuantileTransformer(), QuantileTransformer(random_state=0),
                QuantileTransformer(output_distribution='normal', random_state=0)]
     output_filename = f"{case} scaling results.txt"
-    with open(Path.joinpath(cfg.REPORTS_DIR, output_filename), "w+") as text_file:
+    with open(Path.joinpath(pt.REPORTS_DIR, output_filename), "w+") as text_file:
         text_file.write(f"{case} case using {len(clfs)} clfs and {len(scalers)} scalers\n\n")
     for clf_name, clf in zip(clf_names, clfs):
         for scaler_name, scaler in zip(scaler_names, scalers):
-            with open(Path.joinpath(cfg.REPORTS_DIR, output_filename), "a") as text_file:
+            with open(Path.joinpath(pt.REPORTS_DIR, output_filename), "a") as text_file:
                 text_file.write(f"Results for {clf_name}, {scaler_name}:\n")
             X_sc = pd.DataFrame(scaler.fit_transform(X.iloc[:,:n_scale_cols]))
             X_new = pd.concat([X_sc, X.iloc[:,n_scale_cols:]], axis=1)
             _, result_acc, result_pre, result_recall, result_rocauc, _ = clf(X_new, y)
-            with open(Path.joinpath(cfg.REPORTS_DIR, output_filename), "a") as text_file:
+            with open(Path.joinpath(pt.REPORTS_DIR, output_filename), "a") as text_file:
                 text_file.write(f"Accuracy: {round(np.mean(result_acc), 3)}\n")
                 text_file.write(f"Precision: {round(np.mean(result_pre), 3)}\n")
                 text_file.write(f"Recall: {round(np.mean(result_recall), 3)}\n")
...
 import pandas as pd
 import numpy as np
 from tools import preprocessor, file_reader, explainer
-import config as cfg
+import paths as pt
 import os
 import csv
 import joblib
 from pathlib import Path

 def main():
-    model = file_reader.read_joblib(cfg.COMPLETE_XGB_DIR,
+    model = file_reader.read_joblib(pt.COMPLETE_XGB_DIR,
                                     'complete_xgboost.joblib')
     input_data = {"Gender": [0],
                   "BirthYear": [46],
@@ -18,9 +18,9 @@ def main():
     new_data_df['NumberAts'] = len(new_data_df['Ats'][0].split(","))
     df = preprocessor.split_cat_columns(new_data_df, col_to_split='Ats',
                                         tag='Ats',
-                                        resolution=cfg.ATS_RESOLUTION)
+                                        resolution=10)
-    cols_ats = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
+    cols_ats = [str(i)+'Ats' for i in range(1, 10+1)]
     header_list = ['Gender', 'BirthYear', 'Cluster',
                    'LoanPeriod', 'NumberAts'] + cols_ats
     df = df.reindex(columns=header_list)
@@ -29,8 +29,8 @@ def main():
     df['Cluster'] = 14
     df['Cluster'] = pd.to_numeric(df['Cluster'])
-    for i in range(1, cfg.ATS_RESOLUTION+1):
-        path = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
+    for i in range(1, 10+1):
+        path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
         embedding = file_reader.read_embedding(path, f'complete_{i}Ats.csv')
         column = f'{i}Ats'
         df[column] = df[column].replace(to_replace=embedding)
...
 import pandas as pd
 import numpy as np
-from tools import preprocessor, file_reader, explainer
-import config as cfg
+from tools import file_reader, explainer
+import paths as pt
 from pathlib import Path

 def main():
-    model = file_reader.read_joblib(cfg.COMPLETE_XGB_DIR,
+    model = file_reader.read_joblib(pt.COMPLETE_XGB_DIR,
                                     'complete_xgboost.joblib')
-    converters = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
-    df = file_reader.read_csv(cfg.TESTS_FILES_DIR,
+    converters = {str(i)+'Ats':str for i in range(1, 10+1)}
+    df = file_reader.read_csv(pt.TESTS_FILES_DIR,
                               'test_citizens.csv',
                               converters=converters)
-    for i in range(1, cfg.ATS_RESOLUTION+1):
-        path = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
+    for i in range(1, 10+1):
+        path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
         embedding = file_reader.read_embedding(path, f'complete_{i}Ats.csv')
         column = f'{i}Ats'
         df[column] = df[column].replace(to_replace=embedding)
...
 import pandas as pd
 import numpy as np
-import config as cfg
+import paths as pt
 import os
 import csv
 import joblib
@@ -18,14 +18,14 @@ def main():
     target_name = "Complete"
     step_size = 10
-    for idx in range(1, cfg.ATS_RESOLUTION+1, step_size):
+    for idx in range(1, 10+1, step_size):
         logloss_train, logloss_test = list(), list()
         auc_train, auc_test = list(), list()
         for ats_res in range(idx, idx+step_size):
             make_dataset_full.main(ats_resolution=ats_res)
             make_dataset_emb.main(ats_resolution=ats_res)
-            df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
+            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
             df = df.sample(frac=1, random_state=0).reset_index(drop=True)
@@ -79,7 +79,7 @@ def main():
     plt.ylabel('Logloss')
     plt.xlabel('Iterations')
     plt.title(file_name)
-    plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, f'{file_name}.pdf'),
+    plt.savefig(Path.joinpath(pt.REPORTS_PLOTS_DIR, f'{file_name}.pdf'),
                 dpi=300,
                 bbox_inches = "tight")
...
 import pandas as pd
 import numpy as np
 from tools import preprocessor, file_reader
-import config as cfg
+import paths as pt
 import os
 import csv
 import joblib
 from pathlib import Path

 def main():
-    model = file_reader.read_joblib(cfg.FALL_TEST_XGB_DIR,
+    model = file_reader.read_joblib(pt.FALL_TEST_XGB_DIR,
                                     'fall_test_xgboost.joblib')
     for gender in range(0, 2):
@@ -36,14 +36,14 @@ def main():
         df = preprocessor.split_cat_columns(new_data_df, col_to_split='Ats',
                                             tag='Ats',
-                                            resolution=cfg.ATS_RESOLUTION)
+                                            resolution=10)
         df = preprocessor.split_cat_columns(df, col_to_split='Ex',
                                             tag='Ex',
-                                            resolution=cfg.ATS_RESOLUTION)
+                                            resolution=10)
-        cols_ats = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
-        cols_ex = [str(i)+'Ex' for i in range(1, cfg.EX_RESOLUTION+1)]
+        cols_ats = [str(i)+'Ats' for i in range(1, 10+1)]
+        cols_ex = [str(i)+'Ex' for i in range(1, 9+1)]
         header_list = ['Gender', 'BirthYear', "Cluster",
                        "LoanPeriod", "NumberSplit", "NumberScreening",
                        "NumberWeeks", "MeanEvaluation", "NumberFalls",
@@ -53,15 +53,15 @@ def main():
         df = df.reindex(columns=header_list)
         df = df.fillna('0')
-        for i in range(1, cfg.ATS_RESOLUTION+1):
-            path = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
+        for i in range(1, 10+1):
+            path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
             embedding = file_reader.read_embedding(path, f'fall_test_{i}Ats.csv')
             column = f'{i}Ats'
             df[column] = df[column].replace(to_replace=embedding)
             df[column] = pd.to_numeric(df[column])
-        for i in range(1, cfg.EX_RESOLUTION+1):
-            path = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
+        for i in range(1, 9+1):
+            path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
             embedding = file_reader.read_embedding(path, f'fall_test_{i}Ex.csv')
             column = f'{i}Ex'
             df[column] = df[column].replace(to_replace=embedding)
...
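The embedding-substitution loop above also appears in the two complete_xgboost scripts earlier in this commit. A consolidated helper is one way to avoid the repetition; this is a sketch built on the commit's own APIs (file_reader.read_embedding, pt.PROCESSED_DATA_DIR), with the helper name itself hypothetical:

import pandas as pd
from pathlib import Path
import paths as pt
from tools import file_reader

def apply_embeddings(df: pd.DataFrame, case: str, tag: str, resolution: int) -> pd.DataFrame:
    # Replace each '{i}{tag}' categorical value with its learned embedding and
    # cast the column to numeric. E.g. case='fall_test', tag='Ats', resolution=10
    # reads fall_test_1Ats.csv .. fall_test_10Ats.csv.
    path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
    for i in range(1, resolution + 1):
        embedding = file_reader.read_embedding(path, f'{case}_{i}{tag}.csv')
        column = f'{i}{tag}'
        df[column] = df[column].replace(to_replace=embedding)
        df[column] = pd.to_numeric(df[column])
    return df

With such a helper, the two loops above would collapse to apply_embeddings(df, 'fall_test', 'Ats', 10) and apply_embeddings(df, 'fall_test', 'Ex', 9).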
 #!/usr/bin/env python
-import config as cfg
+import paths as pt
 import pandas as pd
 from sklearn.pipeline import Pipeline
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -11,11 +11,10 @@ from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import StratifiedKFold

 def main():
-    df = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'screenings.csv',
+    df = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                               converters={'CitizenId': str})
-    df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
-                                        resolution=cfg.ATS_RESOLUTION)
+    df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats', resolution=10)
     df = feature_maker.make_complete_feature(df)
     general_cols = df[['CitizenId', 'Gender', 'BirthYear', 'LoanPeriod']]
@@ -76,7 +75,7 @@ def main():
     print('\nBest hyperparameters:')
     print(random_search.best_params_)
     results = pd.DataFrame(random_search.cv_results_)
-    file_writer.write_csv(results, cfg.REPORTS_DIR, 'kmodes-settings-random-grid-search-results.csv')
+    file_writer.write_csv(results, pt.REPORTS_DIR, 'kmodes-settings-random-grid-search-results.csv')

 if __name__ == '__main__':
     main()
 from tools import data_loader
 import tensorflow as tf
 import kerastuner as kt
-import config as cfg
 from pathlib import Path
+import paths as pt
 import shutil

 CASE = "Complete"
@@ -38,7 +38,7 @@ def main():
     tuner = kt.BayesianOptimization(create_model,
         objective='val_accuracy', max_trials=20, executions_per_trial=2,
-        directory=Path.joinpath(cfg.REPORTS_DIR, 'keras_tuner'),
+        directory=Path.joinpath(pt.REPORTS_DIR, 'keras_tuner'),
         project_name='complete_mlp',
         seed=0)
@@ -47,7 +47,7 @@ def main():
     print(tuner.get_best_hyperparameters(num_trials=1)[0].values)
-    shutil.rmtree(Path.joinpath(cfg.REPORTS_DIR, 'keras_tuner'))
+    shutil.rmtree(Path.joinpath(pt.REPORTS_DIR, 'keras_tuner'))

 if __name__ == '__main__':
     main()
...
 import numpy as np
 import pandas as pd
-import config as cfg
+import paths as pt
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import RandomizedSearchCV
@@ -64,7 +64,7 @@ def main():
     print('\n Best hyperparameters:')
     print(random_search.best_params_)
     results = pd.DataFrame(random_search.cv_results_)
-    file_writer.write_csv(results, cfg.REPORTS_DIR, 'xgb-random-grid-search-results.csv')
+    file_writer.write_csv(results, pt.REPORTS_DIR, 'xgb-random-grid-search-results.csv')

 if __name__ == "__main__":
     main()
\ No newline at end of file
@@ -2,24 +2,25 @@
 import numpy as np
 import pandas as pd
-import config as cfg
+import paths as pt
+import yaml
 from typing import List
 from kmodes import kmodes
 from pathlib import Path
 from tools import file_reader, file_writer, preprocessor

-USE_ATS_NAMES = False

 def main():
-    df = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'screenings.csv',
+    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
+        settings = yaml.safe_load(stream)
+    df = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                               converters={'CitizenId': str})
     df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
-                                        resolution=cfg.ATS_RESOLUTION)
+                                        resolution=settings['ats_resolution'])
-    if USE_ATS_NAMES:
+    if settings['use_real_ats_names']:
         df = preprocessor.replace_cat_values(df)
-    cols_ats = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
+    cols_ats = [str(i)+'Ats' for i in range(1, settings['ats_resolution']+1)]
     header_list = ['CitizenId'] + cols_ats
     df = df[header_list]
@@ -39,9 +40,9 @@ def main():
     cluster_centroids = pd.DataFrame(dict([i for i in zip(range(0, len(model.cluster_centroids_)),
                                                           model.cluster_centroids_)]))
-    file_writer.write_joblib(model, cfg.CLUSTERS_DIR, 'km.joblib')
-    file_writer.write_csv(cluster_centroids, cfg.INTERIM_DATA_DIR, f'cluster_centroids.csv')
-    file_writer.write_csv(clusters, cfg.INTERIM_DATA_DIR, 'cl.csv')
+    file_writer.write_joblib(model, pt.CLUSTERS_DIR, 'km.joblib')
+    file_writer.write_csv(cluster_centroids, pt.INTERIM_DATA_DIR, f'cluster_centroids.csv')
+    file_writer.write_csv(clusters, pt.INTERIM_DATA_DIR, 'cl.csv')

 if __name__ == '__main__':
     main()
 #!/usr/bin/env python
-import config as cfg
+import paths as pt
 from tools import file_reader, file_writer
 from tools import preprocessor
 from utility import embedder
 import pandas as pd
 import numpy as np
+from pathlib import Path
+import yaml

 def main():
     make_complete_count()
@@ -13,75 +15,83 @@ def main():
     make_fall_test()

 def make_complete_count():
+    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
+        settings = yaml.safe_load(stream)
     case = 'Complete'
-    ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
-    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
+    ats = {str(i)+'Ats':str for i in range(1, settings['ats_resolution']+1)}
+    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               f'complete.csv',
                               converters=ats)
-    cols_ats = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]