Commit 1ef99e99 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

completed first version of client

parent 0647cddd
Pipeline #53971 passed with stage
in 2 minutes and 41 seconds
import argparse
import os
import config as cfg
from src.data import parse_and_clean_data, make_screenings
from src.data import make_clusters, make_dataset_full
from src.data import make_dataset_count, make_dataset_emb
from src.model import make_xgb_models
def dir_path(string):
    """Argparse type check: return *string* if it is an existing directory.

    Raises:
        NotADirectoryError: if *string* does not name a directory on disk.
    """
    if os.path.isdir(string):
        return string
    # Fail loudly instead of silently returning None to argparse.
    raise NotADirectoryError(string)

def str2bool(v):
    """Argparse type check: parse common textual spellings of a boolean.

    Bool instances pass through unchanged. Otherwise the value is matched
    case-insensitively: 'yes'/'true'/'t'/'y'/'1' -> True and
    'no'/'false'/'f'/'n'/'0' -> False.

    Raises:
        argparse.ArgumentTypeError: for any unrecognized value.
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

def parse_arguments():
    """Build and parse the command-line arguments for the AIR client."""
    parser = argparse.ArgumentParser(description='A client for AIR')
    parser.add_argument('--dataset-year', type=str, default="2020",
                        choices=['2019', '2020'],
                        help='string indicating dataset year')
    parser.add_argument('--dataset-version', type=str, default="emb",
                        choices=['emb', 'ohe'],
                        help='string indicating dataset version')
    parser.add_argument("--enable-visualization", type=str2bool, nargs='?',
                        const=True, default=False,
                        help="boolean indicating if visualization should be enabled.")
    parser.add_argument("--use-real-ats-names", type=str2bool, nargs='?',
                        const=True, default=False,
                        help="boolean indicating if we should use real ats names.")
    return parser.parse_args()
def main():
    """Entry point: run the full AIR pipeline end to end.

    Steps: parse CLI options, parse/clean the raw data for the requested
    year, extract screenings, build the cluster model, generate the full
    dataset plus the requested variant (entity embeddings or one-hot), and
    finally train the XGBoost models on that variant.
    """
    parsed_args = parse_arguments()
    dataset_year = parsed_args.dataset_year
    dataset_version = parsed_args.dataset_version
    enable_visualization = parsed_args.enable_visualization
    use_real_ats_names = parsed_args.use_real_ats_names

    print("Client started. Using this configuration:")
    print(f"Raw data dictionary: {cfg.RAW_DATA_DIR_2020}")
    print(f"Dataset year: {dataset_year}")
    print(f"Dataset version: {dataset_version}")
    print(f"Visualization enabled: {enable_visualization}")
    print(f"Use real ATS names: {use_real_ats_names}\n")

    print("Now parsing and cleaning data ...")
    # Only the 2019 data needs an explicit year; 2020 is the parser's default.
    if dataset_year == '2019':
        parse_and_clean_data.main(year=dataset_year)
    else:
        parse_and_clean_data.main()

    print("Extracting screenings ...")
    make_screenings.main()

    print("Making clusters ...")
    make_clusters.main()
    print(f"Completed making cluster model. It can be found at: {cfg.CLUSTERS_DIR}\n")

    print("Making full dataset ...")
    make_dataset_full.main(use_real_ats_names)
    if dataset_version == "emb":
        print("Making dataset with embedded ats ...")
        make_dataset_emb.main(enable_visualization)
    else:
        print("Making dataset with one-hot encoded ats ...")
        make_dataset_count.main()
    print("\nCompleted generating datasets at:")
    print(f"Interim data dictionary: {cfg.INTERIM_DATA_DIR}")
    print(f"Processed data dictionary: {cfg.PROCESSED_DATA_DIR}\n")

    print(f"Now making 4 XGBoost models based on {dataset_version} ...")
    # Pass the version through so the models train on the variant built above.
    make_xgb_models.main(dataset_version)
    print("Completed making models. Models and SHAP plots can be found at:\n" +
          f"{cfg.COMPLETE_XGB_DIR}\n" + f"{cfg.COMPLIANCE_XGB_DIR}\n" +
          f"{cfg.FALL_XGB_DIR}\n" + f"{cfg.FALL_TEST_XGB_DIR}" + "\n")

if __name__ == "__main__":
    main()
\ No newline at end of file
......@@ -38,16 +38,8 @@
"name": "python388jvsc74a57bd059ff6fbb0321898508cf6243593820bf2585fcfb6693fd00e85ec94ed8847fd0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": ""
},
"orig_nbformat": 2
},
......
#!/usr/bin/env python
from pandas.core.arrays import boolean
import config as cfg
from tools import file_reader, file_writer, feature_maker
from tools import preprocessor, neural_embedder
......@@ -8,13 +9,13 @@ import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA
def main(enable_visualization: bool = True):
    """Create the embedded (entity-embedding) variants of all four datasets.

    Args:
        enable_visualization: when True, each embedding network also saves
            its visualization plots alongside the weights and labels.
    """
    make_complete_emb(enable_visualization)
    make_compliance_emb(enable_visualization)
    make_fall_emb(enable_visualization)
    make_fall_test_emb(enable_visualization)
def make_complete_emb():
def make_complete_emb(enable_visualization):
case = 'Complete'
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
......@@ -50,7 +51,9 @@ def make_complete_emb():
embedded_weights = network.get_embedded_weights()
network.save_weights(embedded_weights)
network.save_labels(labels)
network.make_visualizations_from_network(extension='png')
if enable_visualization:
network.make_visualizations_from_network(extension='png')
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
......@@ -71,7 +74,7 @@ def make_complete_emb():
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
def make_compliance_emb():
def make_compliance_emb(enable_visualization):
case = 'Compliance'
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
......@@ -107,7 +110,9 @@ def make_compliance_emb():
embedded_weights = network.get_embedded_weights()
network.save_weights(embedded_weights)
network.save_labels(labels)
network.make_visualizations_from_network(extension='png')
if enable_visualization:
network.make_visualizations_from_network(extension='png')
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
......@@ -128,7 +133,7 @@ def make_compliance_emb():
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
def make_fall_emb():
def make_fall_emb(enable_visualization):
case = 'Fall'
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
......@@ -164,7 +169,9 @@ def make_fall_emb():
embedded_weights = network.get_embedded_weights()
network.save_weights(embedded_weights)
network.save_labels(labels)
network.make_visualizations_from_network(extension='png')
if enable_visualization:
network.make_visualizations_from_network(extension='png')
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
......@@ -185,7 +192,7 @@ def make_fall_emb():
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
def make_fall_test_emb():
def make_fall_test_emb(enable_visualization):
ex = {str(i)+'Ex':str for i in range(1, cfg.EX_RESOLUTION+1)}
ats = {str(i)+'Ats':str for i in range(1, cfg.ATS_RESOLUTION+1)}
converters = {**ex, **ats}
......@@ -202,15 +209,15 @@ def make_fall_test_emb():
df_ex_to_enc = df.filter(regex=f'Fall|((\d+)[Ex])\w+', axis=1)
df_ex_to_enc = df_ex_to_enc.drop(['NumberFalls'], axis=1)
ats_enc = encode_dataframe(df_ats_to_enc, 'Fall')
ex_enc = encode_dataframe(df_ex_to_enc, 'Fall')
ats_enc = encode_dataframe(df_ats_to_enc, 'Fall', enable_visualization)
ex_enc = encode_dataframe(df_ex_to_enc, 'Fall', enable_visualization)
df = df.drop(ats_cols + ex_cols, axis=1)
df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, df.pop('Fall')], axis=1)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
def encode_dataframe(df_to_enc, case):
def encode_dataframe(df_to_enc, case, enable_visualization):
target_name = case
train_ratio = 0.9
......@@ -236,8 +243,10 @@ def encode_dataframe(df_to_enc, case):
embedded_weights = network.get_embedded_weights()
network.save_weights(embedded_weights)
network.save_labels(labels)
network.make_visualizations_from_network(extension='png')
if enable_visualization:
network.make_visualizations_from_network(extension='png')
df_to_enc = df_to_enc.drop('Fall', axis=1)
for index in range(df_to_enc.shape[1] - 1):
column = df_to_enc.columns[index]
......
......@@ -4,9 +4,7 @@ from tools import file_reader, file_writer, feature_maker
from tools import preprocessor
import pandas as pd
USE_CAT_NAMES = False
def main():
def main(use_real_ats_names: bool = False):
clusters = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'cl.csv',
converters={'CitizenId': str, 'Cluster': int})
screenings = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'screenings.csv',
......@@ -14,12 +12,12 @@ def main():
fall_data = pd.DataFrame(file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'fd.pkl'))
fall_data = fall_data.drop_duplicates(["CitizenId", "Date"])
make_complete_case(screenings, clusters)
make_compliance_case(screenings, clusters)
make_fall_case(screenings, clusters)
make_fall_test_case(screenings, clusters, fall_data)
make_complete_case(screenings, clusters, use_real_ats_names)
make_compliance_case(screenings, clusters, use_real_ats_names)
make_fall_case(screenings, clusters, use_real_ats_names)
make_fall_test_case(screenings, clusters, fall_data, use_real_ats_names)
def make_complete_case(df, clusters):
def make_complete_case(df, clusters, use_real_ats_names):
df['Cluster'] = clusters['Cluster']
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=cfg.ATS_RESOLUTION)
......@@ -29,14 +27,14 @@ def make_complete_case(df, clusters):
general_cols = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod']]
df = pd.concat([general_cols, ats_cols, df[['Complete']]], axis=1)
if USE_CAT_NAMES:
if use_real_ats_names:
ats = file_reader.read_csv(cfg.REFERENCES_DIR, 'ats.csv',
converters={'ats_id': str})
df = preprocessor.replace_cat_values(df, ats)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, f'complete.csv')
def make_compliance_case(df, clusters):
def make_compliance_case(df, clusters, use_real_ats_names):
df['Cluster'] = clusters['Cluster']
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=cfg.ATS_RESOLUTION)
......@@ -46,14 +44,14 @@ def make_compliance_case(df, clusters):
general_cols = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod']]
df = pd.concat([general_cols, ats_cols, df[['Compliance']]], axis=1)
if USE_CAT_NAMES:
if use_real_ats_names:
ats = file_reader.read_csv(cfg.REFERENCES_DIR, 'ats.csv',
converters={'ats_id': str})
df = preprocessor.replace_cat_values(df, ats)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, f'compliance.csv')
def make_fall_case(df, clusters):
def make_fall_case(df, clusters, use_real_ats_names):
df['Cluster'] = clusters['Cluster']
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=cfg.ATS_RESOLUTION)
......@@ -63,14 +61,14 @@ def make_fall_case(df, clusters):
general_cols = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod']]
df = pd.concat([general_cols, ats_cols, df[['Fall']]], axis=1)
if USE_CAT_NAMES:
if use_real_ats_names:
ats = file_reader.read_csv(cfg.REFERENCES_DIR, 'ats.csv',
converters={'ats_id': str})
df = preprocessor.replace_cat_values(df, ats)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, f'fall.csv')
def make_fall_test_case(df, clusters, fall_data):
def make_fall_test_case(df, clusters, fall_data, use_real_ats_names):
df['Cluster'] = clusters['Cluster']
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=cfg.ATS_RESOLUTION)
......@@ -86,7 +84,7 @@ def make_fall_test_case(df, clusters, fall_data):
'NumberWeeksNoTraining', 'Needs', 'Physics']]
df = pd.concat([general_cols, ats_ex_cols, df[['Fall']]], axis=1)
if USE_CAT_NAMES:
if use_real_ats_names:
ats = file_reader.read_csv(cfg.REFERENCES_DIR, 'ats.csv',
converters={'ats_id': str})
ex = file_reader.read_csv(cfg.REFERENCES_DIR, 'ex.csv',
......
#!/usr/bin/env python
import pathlib
import numpy as np
import pandas as pd
import config as cfg
import os
from pathlib import Path
from tools import file_writer, parser, cleaner
def main():
parser2020 = parser.Parser2020()
ats = parser2020.parse_assistive_aids(cfg.PATHS_2020[0], cfg.RAW_DATA_DIR_2020)
td = parser2020.parse_training_done(cfg.PATHS_2020[1], cfg.RAW_DATA_DIR_2020)
sc = parser2020.parse_screening_content(cfg.PATHS_2020[1], cfg.RAW_DATA_DIR_2020)
tc = parser2020.parse_training_cancelled(cfg.PATHS_2020[1], cfg.RAW_DATA_DIR_2020)
ss = parser2020.parse_status_set(cfg.PATHS_2020[1], cfg.RAW_DATA_DIR_2020)
ic = parser2020.parse_iso_classes(cfg.PATHS_2019[5], cfg.RAW_DATA_DIR_2019)
fd = parser2020.parse_fall_data(cfg.PATHS_2020[2], cfg.RAW_DATA_DIR_2020)
def main(year: int = '2020'):
if year == '2019':
parser19 = parser.Parser2019()
ats = parser19.parse_assistive_aids(cfg.PATHS_2019[0], cfg.RAW_DATA_DIR_2019)
td = parser19.parse_training_done(cfg.PATHS_2019[1], cfg.RAW_DATA_DIR_2019)
sc = parser19.parse_screening_content(cfg.PATHS_2019[1], cfg.RAW_DATA_DIR_2019)
tc = parser19.parse_training_cancelled(cfg.PATHS_2019[1], cfg.RAW_DATA_DIR_2019)
ss = parser19.parse_status_set(cfg.PATHS_2019[1], cfg.RAW_DATA_DIR_2019)
if os.path.isfile(Path.joinpath(cfg.RAW_DATA_DIR_2019, cfg.PATHS_2019[5])):
ic = parser19.parse_iso_classes(cfg.PATHS_2019[5], cfg.RAW_DATA_DIR_2019)
else:
print("No iso class file found, continuing ...")
ic = pd.DataFrame()
fd = parser19.parse_fall_data(cfg.PATHS_2019[2], cfg.RAW_DATA_DIR_2019)
else:
parser20 = parser.Parser2020()
ats = parser20.parse_assistive_aids(cfg.PATHS_2020[0], cfg.RAW_DATA_DIR_2020)
td = parser20.parse_training_done(cfg.PATHS_2020[1], cfg.RAW_DATA_DIR_2020)
sc = parser20.parse_screening_content(cfg.PATHS_2020[1], cfg.RAW_DATA_DIR_2020)
tc = parser20.parse_training_cancelled(cfg.PATHS_2020[1], cfg.RAW_DATA_DIR_2020)
ss = parser20.parse_status_set(cfg.PATHS_2020[1], cfg.RAW_DATA_DIR_2020)
if os.path.isfile(Path.joinpath(cfg.RAW_DATA_DIR_2019, cfg.PATHS_2019[5])):
ic = parser20.parse_iso_classes(cfg.PATHS_2019[5], cfg.RAW_DATA_DIR_2019)
else:
print("No iso class file found, continuing ...")
ic = pd.DataFrame()
fd = parser20.parse_fall_data(cfg.PATHS_2020[2], cfg.RAW_DATA_DIR_2020)
cleaner2020 = cleaner.Cleaner2020()
patient_data = td[['CitizenId', 'Gender', 'BirthYear']].drop_duplicates(keep='first')
......@@ -30,7 +55,10 @@ def main():
file_writer.write_pickle(training_done, cfg.INTERIM_DATA_DIR, 'td.pkl')
file_writer.write_pickle(training_cancelled, cfg.INTERIM_DATA_DIR, 'tc.pkl')
file_writer.write_pickle(assistive_aids, cfg.INTERIM_DATA_DIR, 'ats.pkl')
file_writer.write_pickle(ic, cfg.INTERIM_DATA_DIR, 'ic.pkl')
if not ic.empty:
file_writer.write_pickle(ic, cfg.INTERIM_DATA_DIR, 'ic.pkl')
file_writer.write_pickle(fall_data, cfg.INTERIM_DATA_DIR, 'fd.pkl')
if __name__ == "__main__":
......
......@@ -13,22 +13,34 @@ import xgboost as xgb
DATA_DIR = cfg.PROCESSED_DATA_DIR
CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
def main():
def main(dataset_version : str = 'emb'):
for case in CASES:
if case == "Complete":
df = file_reader.read_csv(DATA_DIR, 'complete_emb.csv')
if dataset_version == 'ohe':
df = file_reader.read_csv(DATA_DIR, 'complete_count.csv')
else:
df = file_reader.read_csv(DATA_DIR, 'complete_emb.csv')
model_dir = cfg.COMPLETE_XGB_DIR
target_name = "Complete"
elif case == "Compliance":
df = file_reader.read_csv(DATA_DIR, 'compliance_emb.csv')
if dataset_version == 'ohe':
df = file_reader.read_csv(DATA_DIR, 'compliance_count.csv')
else:
df = file_reader.read_csv(DATA_DIR, 'compliance_emb.csv')
model_dir = cfg.COMPLIANCE_XGB_DIR
target_name = "Compliance"
elif case == "Fall":
df = file_reader.read_csv(DATA_DIR, 'fall_emb.csv')
if dataset_version == 'ohe':
df = file_reader.read_csv(DATA_DIR, 'fall_count.csv')
else:
df = file_reader.read_csv(DATA_DIR, 'fall_emb.csv')
model_dir = cfg.FALL_XGB_DIR
target_name = "Fall"
else:
df = file_reader.read_csv(DATA_DIR, 'fall_test_emb.csv')
if dataset_version == 'ohe':
df = file_reader.read_csv(DATA_DIR, 'fall_test_count.csv')
else:
df = file_reader.read_csv(DATA_DIR, 'fall_test_emb.csv')
model_dir = cfg.FALL_TEST_XGB_DIR
target_name = "Fall"
......@@ -87,6 +99,7 @@ def main():
f'{case.lower()}_xgb_cm.pdf', case)
file_writer.write_joblib(model, model_dir, f'{case.lower()}_xgboost.joblib')
print(f"Scores for {case} XGBoost model:")
print(f"Accuracy: {np.around(accuracy_score(y_test, y_pred), decimals=3)}")
print(f"Precision: {np.around(precision_score(y_test, y_pred), decimals=3)}")
print(f"Recall: {np.around(recall_score(y_test, y_pred), decimals=3)}")
......
......@@ -59,7 +59,7 @@ def main():
'Vælg en borger fra listen',
('Jens Jensen', 'Hans Hansen', 'Tom Tomsen')
)
st.write('Du har valgt:', citizen)
if st.checkbox('Vis data for borger', value=True):
......
......@@ -76,7 +76,8 @@ class Cleaner2020(BaseCleaner):
ats = remove_rows_with_old_dates(ats, cfg.LEND_DATE)
ats = remove_deprecated_device_data(ats)
ats = remove_tainted_histories(ats)
ats = drop_invalid_devices(ats, ic)
if not ic.empty:
ats = drop_invalid_devices(ats, ic)
return ats
def clean_fall_data(self, fd: pd.DataFrame) -> pd.DataFrame:
......@@ -125,7 +126,8 @@ class Cleaner2019(BaseCleaner):
ats = remove_rows_with_old_dates(ats, cfg.RETURN_DATE)
ats = remove_deprecated_device_data(ats)
ats = remove_tainted_histories(ats)
ats = drop_invalid_devices(ats, ic)
if not ic.empty:
ats = drop_invalid_devices(ats, ic)
return ats
def clean_fall_data(self, fd: pd.DataFrame) -> pd.DataFrame:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment