Commit 58c01f93 authored by thecml

updated code based on lint feedback

parent df195d24
Pipeline #96352 failed with stage in 3 minutes and 49 seconds
.vscode/settings.json
@@ -96,7 +96,6 @@ def main():
     cases = ["Risk"]
     for case in cases:
         target_settings = load_settings(pt.CONFIGS_DIR, f'{case.lower()}.yaml')
-        data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
         output_filename = f"{case} model baseline.csv"
         header = ['clf', 'version', 'accuracy_mean', 'accuracy_std',
                   'precision_mean', 'precision_std', 'recall_mean',
......
@@ -8,8 +8,7 @@ import joblib
 from pathlib import Path

 def main():
-    model = file_reader.read_joblib(pt.RISK_XGB_DIR,
-                                    'fall_test_xgboost.joblib')
+    model = file_reader.read_joblib(pt.MODELS_DIR, 'risk_xgb.joblib')
     for gender in range(0, 2):
         input_data = {"Gender": [gender],
......
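The hunk above swaps a stale model path for the current one. A minimal sketch of the presumed helper follows, assuming tools.file_reader.read_joblib simply delegates to joblib.load; the helper's real body is not shown in this diff.

    from pathlib import Path
    import joblib

    def read_joblib(file_path, file_name):
        # joblib.load accepts a path-like object and returns the persisted estimator
        return joblib.load(Path(file_path).joinpath(file_name))

    # Usage mirroring the new call site:
    # model = read_joblib(pt.MODELS_DIR, 'risk_xgb.joblib')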
@@ -35,7 +35,9 @@ def main():
     X = X[:10000]
     y = y[:10000]
-    model = RandomSurvivalForest(random_state=0)
+    model = RandomSurvivalForest(n_estimators=200,
+                                 max_depth=3,
+                                 random_state=0)
     kf = KFold(n_splits=5, shuffle=True, random_state=0)
     c_index_scores = list()
......
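This hunk pins the survival forest's capacity instead of relying on defaults. A sketch of the cross-validation loop the hunk feeds into, assuming scikit-survival's RandomSurvivalForest with X as a NumPy feature matrix and y as a structured (event, time) array; the loop body is elided in the diff, so this is a plausible reconstruction, not the committed code.

    import numpy as np
    from sklearn.model_selection import KFold
    from sksurv.ensemble import RandomSurvivalForest

    model = RandomSurvivalForest(n_estimators=200, max_depth=3, random_state=0)
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    c_index_scores = list()
    for train_idx, test_idx in kf.split(X):  # X, y come from the surrounding script
        model.fit(X[train_idx], y[train_idx])
        # score() returns Harrell's concordance index for survival estimators
        c_index_scores.append(model.score(X[test_idx], y[test_idx]))
    print(f"Mean C-index: {np.mean(c_index_scores):.3f}")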
@@ -14,7 +14,7 @@ def main():
     ss = loader.load_status_set(pt.PATHS_2020[1], pt.RAW_DATA_DIR_2020)
     ic = loader.load_iso_classes('isoall.txt', pt.REFERENCES_DIR)
-    cleaner2020 = cleaner.Cleaner2020()
+    cleaner2020 = cleaner.Cleaner()
     patient_data = td[['CitizenId', 'Gender', 'BirthYear']].drop_duplicates(keep='first')
     patient_data = cleaner2020.clean_patient_data(patient_data)
     screening_content = cleaner2020.clean_screening_content(sc, patient_data)
@@ -22,31 +22,31 @@ def main():
     training_done = cleaner2020.clean_training_done(td, patient_data)
     training_cancelled = cleaner2020.clean_training_cancelled(tc, patient_data)
     assistive_aids = cleaner2020.clean_assistive_aids(ats, ic)
     with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'sc.pkl'), 'wb') as fd:
         outfile = BytesIO()
         file_writer.write_pickle(screening_content, outfile)
         outfile.seek(0)
         shutil.copyfileobj(outfile, fd)
     with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'ss.pkl'), 'wb') as fd:
         outfile = BytesIO()
         file_writer.write_pickle(status_set, outfile)
         outfile.seek(0)
         shutil.copyfileobj(outfile, fd)
     with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'td.pkl'), 'wb') as fd:
         outfile = BytesIO()
         file_writer.write_pickle(training_done, outfile)
         outfile.seek(0)
         shutil.copyfileobj(outfile, fd)
     with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'tc.pkl'), 'wb') as fd:
         outfile = BytesIO()
         file_writer.write_pickle(training_cancelled, outfile)
         outfile.seek(0)
         shutil.copyfileobj(outfile, fd)
     with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'ats.pkl'), 'wb') as fd:
         outfile = BytesIO()
         file_writer.write_pickle(assistive_aids, outfile)
@@ -58,6 +58,6 @@ def main():
         file_writer.write_pickle(ic, outfile)
         outfile.seek(0)
         shutil.copyfileobj(outfile, fd)

 if __name__ == "__main__":
     main()
\ No newline at end of file
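The same four-line write pattern (buffer the pickle in a BytesIO, then copy it into the target file) repeats five times above. A hypothetical refactoring, not part of the commit; dump_pickle is an invented name and file_writer is the project's own helper as used in the diff.

    import shutil
    from io import BytesIO
    from pathlib import Path
    from tools import file_writer  # project helper, as in the diff

    def dump_pickle(df, out_dir, file_name):
        with open(Path(out_dir).joinpath(file_name), 'wb') as fd:
            outfile = BytesIO()
            file_writer.write_pickle(df, outfile)  # serialize into the in-memory buffer
            outfile.seek(0)
            shutil.copyfileobj(outfile, fd)  # flush the buffer to disk

    # dump_pickle(screening_content, pt.INTERIM_DATA_DIR, 'sc.pkl')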
 #!/usr/bin/env python
 import paths as pt
-from tools import file_reader, file_writer, data_loader
+from tools import file_reader, file_writer
 from tools import preprocessor
 from utility import embedder
 from utility.settings import load_settings
@@ -9,23 +9,22 @@ import numpy as np
 from pathlib import Path
 from io import StringIO
 import shutil
-from pathlib import Path

 def main():
     for label_name in ["Complete", "Compliance", "Fall", "Risk"]:
         data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
         ats_resolution = data_settings['ats_resolution']
         if label_name == "Risk":
             target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
             ex_resolution = target_settings['ex_resolution']
         if label_name in ["Complete", "Compliance", "Fall"]:
             ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
             infile = StringIO()
             file_path = pt.PROCESSED_DATA_DIR
             file_name = f'{label_name.lower()}.csv'
-            with open(Path.joinpath(file_path, file_name), 'r') as fd:
+            with open(Path.joinpath(file_path, file_name), 'r', encoding='utf8') as fd:
                 shutil.copyfileobj(fd, infile)
             infile.seek(0)
             df = file_reader.read_csv(infile, converters=ats)
@@ -36,11 +35,11 @@ def main():
             infile = StringIO()
             file_path = pt.PROCESSED_DATA_DIR
             file_name = f'{label_name.lower()}.csv'
-            with open(Path.joinpath(file_path, file_name), 'r') as fd:
+            with open(Path.joinpath(file_path, file_name), 'r', encoding='utf8') as fd:
                 shutil.copyfileobj(fd, infile)
             infile.seek(0)
             df = file_reader.read_csv(infile, converters=converters)
         if label_name in ["Complete", "Compliance", "Fall"]:
             cols_ats = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
             unique_ats = [df[f'{i}Ats'].unique() for i in range(1, ats_resolution+1)]
@@ -68,7 +67,7 @@ def main():
         outfile = StringIO()
         file_path = pt.PROCESSED_DATA_DIR
         file_name = f'{label_name.lower()}_count.csv'
-        with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
+        with open(Path.joinpath(file_path, file_name), 'w', newline='', encoding='utf8') as fd:
             file_writer.write_csv(df, outfile)
             outfile.seek(0)
             shutil.copyfileobj(outfile, fd)
......
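Both read sites above follow the same shape: copy the file into a StringIO, rewind, parse. A hypothetical helper capturing that pattern with the newly added explicit encoding; load_csv is an invented name, and file_reader.read_csv is assumed to wrap pandas.read_csv over a buffer as the diff suggests.

    import shutil
    from io import StringIO
    from pathlib import Path
    from tools import file_reader  # project helper, as in the diff

    def load_csv(file_path, file_name, converters=None):
        infile = StringIO()
        with open(Path(file_path).joinpath(file_name), 'r', encoding='utf8') as fd:
            shutil.copyfileobj(fd, infile)  # buffer the whole file in memory
        infile.seek(0)
        return file_reader.read_csv(infile, converters=converters)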
@@ -12,26 +12,26 @@ def main(ats_resolution: int = None):
     infile = StringIO()
     file_path = pt.INTERIM_DATA_DIR
     file_name = 'screenings.csv'
-    with open(Path.joinpath(file_path, file_name), 'r') as fd:
+    with open(Path.joinpath(file_path, file_name), 'r', encoding='utf8') as fd:
         shutil.copyfileobj(fd, infile)
     infile.seek(0)
     screenings = file_reader.read_csv(infile, converters={'CitizenId': str})
     data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
     if ats_resolution == None:
         ats_resolution = data_settings['ats_resolution']
     df = screenings.copy()
     accum_screenings = labeler.accumulate_screenings(df, data_settings)
     for label_name in ['Complete', 'Compliance', 'Fall', 'Risk']:
         target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
         features = target_settings['features']
         if label_name == "Risk":
             ex_resolution = target_settings['ex_resolution']
             risk_period_months = target_settings['risk_period_months']
         if label_name == 'Complete':
             df = labeler.make_complete_label(accum_screenings)
         elif label_name == 'Compliance':
@@ -40,7 +40,7 @@ def main(ats_resolution: int = None):
             df = labeler.make_fall_label(accum_screenings)
         else:
             df = labeler.make_risk_label(accum_screenings, risk_period_months)
         df = preprocessor.split_cat_columns(df, col_to_split='Ats',
                                             tag='Ats', resolution=ats_resolution)
         if label_name == "Risk":
@@ -57,10 +57,11 @@ def main(ats_resolution: int = None):
         outfile = StringIO()
         file_path = pt.PROCESSED_DATA_DIR
         file_name = f'{label_name.lower()}.csv'
-        with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
+        with open(Path.joinpath(file_path, file_name), 'w',
+                  newline='', encoding='utf8') as fd:
             file_writer.write_csv(df, outfile)
             outfile.seek(0)
             shutil.copyfileobj(outfile, fd)

 if __name__ == "__main__":
     main()
\ No newline at end of file
@@ -6,10 +6,11 @@ import pandas as pd
 import paths as pt
 from tools import file_reader, file_writer, inputter
 from utility.settings import load_settings
-from utility import data_dto, dataset
+from utility import data_dto
 from pandas.tseries.offsets import DateOffset
 from io import StringIO, BytesIO
 import shutil
+from typing import List, Tuple

 def main():
     with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'sc.pkl'), 'rb') as fd:
@@ -53,7 +54,7 @@ def main():
         shutil.copyfileobj(outfile, fd)

 def get_screenings(data, settings):
-    ids = dataset.create_union_of_ids(data.sc, data.ss, data.td, data.tc)
+    ids = create_union_of_ids(data.sc, data.ss, data.td, data.tc)
     all_screenings = pd.DataFrame()
     for id in ids:
@@ -177,5 +178,11 @@ def get_screenings_by_id(data, id, settings):
     return screenings

+def create_union_of_ids(*args: Tuple) -> List[str]:
+    ids = []
+    for arg in args:
+        ids = list(set().union(ids, arg['CitizenId'].unique()))
+    return ids
+
 if __name__ == "__main__":
     main()
\ No newline at end of file
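The commit moves create_union_of_ids from the removed utility.dataset module into this script. For illustration only, an equivalent version that keeps one set across the loop instead of rebuilding a list on every iteration; behavior is the same apart from element order.

    from typing import List, Tuple

    def create_union_of_ids(*args: Tuple) -> List[str]:
        ids = set()
        for arg in args:
            ids |= set(arg['CitizenId'].unique())  # union in the frame's unique ids
        return list(ids)

    # ids = create_union_of_ids(data.sc, data.ss, data.td, data.tc)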
@@ -4,23 +4,19 @@ import numpy as np
 from pathlib import Path
 import paths as pt
 import pyodbc
-from tools import preprocessor
+from tools import preprocessor, file_reader
 from datetime import date, datetime, timedelta
 import random
+from io import StringIO
+import shutil

 def main():
     # Load data
-    df_home_care = pd.read_csv(Path.joinpath(pt.RAW_DATA_DIR_TEST, "Hjemmepleje.csv"),
-                               encoding="iso-8859-1",
-                               skiprows=2)
-    df_ats = pd.read_csv(Path.joinpath(pt.RAW_DATA_DIR_TEST, "Hjælpemidler.csv"),
-                         encoding="iso-8859-1",
-                         skiprows=2,
-                         converters={'HMI nr': str, 'Kategori ISO nummer': str})
-    df_training = pd.read_csv(Path.joinpath(pt.RAW_DATA_DIR_TEST, "Træning.csv"),
-                              encoding="iso-8859-1",
-                              skiprows=2)
+    dir = pt.RAW_DATA_DIR_TEST
+    df_home_care = read_csv(dir, "Hjemmepleje.csv", encoding="iso-8859-1", skiprows=2)
+    df_ats = read_csv(dir, "Hjælpemidler.csv", encoding="iso-8859-1", skiprows=2,
+                      conveters={'HMI nr': str, 'Kategori ISO nummer': str})
+    df_training = read_csv(dir, "Træning.csv", encoding="iso-8859-1", skiprows=2)
     df_general = df_home_care.drop_duplicates(subset=["Borger Id"])[["Borger Id",
                                                                      "Alder (aktuel)"]].reset_index(drop=True)
@@ -64,7 +60,7 @@ def main():
     df_general['CPR'] = df_general.apply(lambda x: get_ssn(x['Alder (aktuel)'], x['Køn']), axis=1)
-    # Insert general data
+    # Db
     server = "tcp:air-db-server.database.windows.net,1433"
     database = "air-db"
     username = "airadmin"
@@ -72,6 +68,7 @@ def main():
     cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
     cursor = cnxn.cursor()
+    # General data
     ids = list()
     for index, row in df_general.iterrows():
         cursor.execute("INSERT INTO dbo.citizen (first_name,last_name,ssn,age,gender) values(?,?,?,?,?)",
@@ -149,5 +146,16 @@ def main():
     cursor.close()

+def read_csv(file_path, file_name, conveters=None,
+             encoding=None, skiprows=None):
+    infile = StringIO()
+    with open(Path.joinpath(file_path, file_name), 'r') as fd:
+        shutil.copyfileobj(fd, infile)
+    infile.seek(0)
+    return file_reader.read_csv(infile,
+                                converters=conveters,
+                                encoding=encoding,
+                                skiprows=skiprows)
+
 if __name__ == '__main__':
     main()
\ No newline at end of file
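Two caveats in the new helper as committed: the keyword is spelled conveters (consistently, so the calls do work), and the file is opened with the platform default encoding even though callers pass encoding="iso-8859-1", which only reaches the parser after the bytes have already been decoded. A variant that applies the caller's encoding at open() time, where decoding actually happens; this is a suggested correction, not the committed code.

    import shutil
    from io import StringIO
    from pathlib import Path
    from tools import file_reader  # project helper, as in the diff

    def read_csv(file_path, file_name, converters=None, encoding=None, skiprows=None):
        infile = StringIO()
        # decode with the caller's encoding here, not in the parser
        with open(Path(file_path).joinpath(file_name), 'r', encoding=encoding) as fd:
            shutil.copyfileobj(fd, infile)
        infile.seek(0)
        return file_reader.read_csv(infile, converters=converters, skiprows=skiprows)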
@@ -35,7 +35,7 @@ def main():
                                     settings).load_data()
         X, y = dl.prepare_data()
     else:
-        settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
+        settings = load_settings(pt.CONFIGS_DIR, "risk.yaml")
         file_name = f'risk_{DATASET_VERSION}.csv'
         dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
                                         file_name,
......
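This hunk fixes a copy-paste bug: the risk branch was loading the fall config. A minimal sketch of the presumed settings loader, assuming utility.settings.load_settings is a thin YAML wrapper; its real body is not shown in this diff.

    from pathlib import Path
    import yaml

    def load_settings(config_dir, file_name):
        with open(Path(config_dir).joinpath(file_name), 'r', encoding='utf8') as fd:
            return yaml.safe_load(fd)  # parse the per-target config dict

    # settings = load_settings(pt.CONFIGS_DIR, "risk.yaml")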
 from pathlib import Path

-PATHS_2019 = ['DigiRehab_BorgerID_TrainingDone.xlsx',
-              'DigiRehab_BorgerID_TrainingCancelled.xlsx',
-              'DigiRehab_BorgerID_StatusSet.xlsx',
-              'DigiRehab_BorgerID_ScreeningContent.xlsx',
-              'HjaelpemidlerUdtraek.csv',
-              'isoall.txt',
-              'DigiRehab_borgerIDALL.csv',
-              'clusters.csv']
 PATHS_2020 = ['borgere_hmi_Rasmus_BorgerId_Gender_BirthYear.xlsx',
               'DrPatientData_RasmusPlusBorgerIdMinusCPR_2020.xlsx',
               'Observationer_Rasmus_BorgerId_Gender_BirthYear.xlsx',
......
-import numpy as np
-import tensorflow as tf
-import xgboost as xgb
+"""
+classifiers.py
+====================================
+Module to store classifers used for CV.
+"""
+from abc import ABC, abstractmethod
+from typing import Tuple, List
 from sklearn.neighbors import KNeighborsClassifier
-from sklearn.model_selection import cross_validate
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import cross_validate, StratifiedKFold
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.svm import SVC
 from sklearn.linear_model import LogisticRegression
-from sklearn.svm import SVC
 from tools import preprocessor
-from abc import ABC, abstractmethod
-from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
-from typing import Tuple, List
+from keras.wrappers.scikit_learn import KerasClassifier
+import numpy as np
+import tensorflow as tf
+import xgboost as xgb

 class BaseClassifer(ABC):
     """
     Base class for classifiers.
     """
     def __init__(self, X, y):
-        """Initilizes inputs and targets variables"""
+        """Initilizes inputs and targets variables."""
         self.X = X
         self.y = y

     @abstractmethod
     def make_model(self):
         """
         This method is an abstract method to be implemented
         by a concrete classifier. Must return a sklearn-compatible
-        estimator object implementing 'fit'
+        estimator object implementing 'fit'.
         """

-    def evaluate(self, metrics:List = ['accuracy'], k: int=0) -> Tuple[dict,
-                                                                       np.ndarray]:
+    def evaluate(self, metrics:List=None, k: int=0) -> Tuple[dict,np.ndarray]:
         """
         This method performs cross validation for k seeds
         on a given dataset X and y and outputs the results
@@ -47,26 +50,30 @@ class BaseClassifer(ABC):
         for metric in metrics:
             results[metric] = res_validate[f'test_{metric}']
         return results

 class KnnClassifier(BaseClassifer):
+    """KNN classifier."""
     def make_model(self):
         return KNeighborsClassifier(n_neighbors=10,
                                     weights='distance',
                                     p=1)

 class SvmClassifier(BaseClassifer):
+    """Support vector machine classifier."""
     def make_model(self):
         return SVC(random_state=0,
                    class_weight="balanced",
                    probability=True)

 class LrClassifier(BaseClassifer):
+    """Logistic regression classifier."""
     def make_model(self):
         return LogisticRegression(max_iter=1000,
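The diff is truncated here. Note the signature change in evaluate(): metrics:List=None replaces the mutable default ['accuracy'], a standard lint fix; presumably the body restores that default when metrics is None, though the relevant lines are elided. A sketch of how the ABC is consumed, with X and y assumed to be arrays from the caller; the direct cross_validate call stands in for the partly elided evaluate() body.

    import numpy as np
    from sklearn.model_selection import cross_validate, StratifiedKFold

    clf = KnnClassifier(X, y)          # subclass supplies the estimator
    model = clf.make_model()
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    res = cross_validate(model, X, y, cv=skf, scoring=['accuracy'])
    # cross_validate returns one 'test_<metric>' array per requested metric
    print(np.mean(res['test_accuracy']))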