Commit 58c01f93 authored by thecml

updated code based on lint feedback

parent df195d24
Pipeline #96352 failed with stage in 3 minutes and 49 seconds
.vscode/settings.json
......@@ -96,7 +96,6 @@ def main():
cases = ["Risk"]
for case in cases:
target_settings = load_settings(pt.CONFIGS_DIR, f'{case.lower()}.yaml')
data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
output_filename = f"{case} model baseline.csv"
header = ['clf', 'version', 'accuracy_mean', 'accuracy_std',
'precision_mean', 'precision_std', 'recall_mean',
......
......@@ -8,8 +8,7 @@ import joblib
from pathlib import Path
def main():
model = file_reader.read_joblib(pt.RISK_XGB_DIR,
'fall_test_xgboost.joblib')
model = file_reader.read_joblib(pt.MODELS_DIR, 'risk_xgb.joblib')
for gender in range(0, 2):
input_data = {"Gender": [gender],
......
......@@ -35,7 +35,9 @@ def main():
X = X[:10000]
y = y[:10000]
model = RandomSurvivalForest(random_state=0)
model = RandomSurvivalForest(n_estimators=200,
max_depth=3,
random_state=0)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
c_index_scores = list()
......
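The elided remainder of this hunk presumably fits the forest per fold and collects a concordance index. A minimal sketch of that evaluation, assuming scikit-survival's RandomSurvivalForest (whose score() returns Harrell's c-index) and numpy-indexable X and y; the function name is illustrative, not the repository's code:

import numpy as np
from sklearn.model_selection import KFold
from sksurv.ensemble import RandomSurvivalForest

def evaluate_c_index(X, y, n_splits=5):
    # Same hyperparameters as the updated line above.
    model = RandomSurvivalForest(n_estimators=200, max_depth=3, random_state=0)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    c_index_scores = []
    for train_idx, test_idx in kf.split(X):
        model.fit(X[train_idx], y[train_idx])
        # score() on a fitted RandomSurvivalForest returns Harrell's c-index.
        c_index_scores.append(model.score(X[test_idx], y[test_idx]))
    return np.mean(c_index_scores), np.std(c_index_scores)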
......@@ -14,7 +14,7 @@ def main():
ss = loader.load_status_set(pt.PATHS_2020[1], pt.RAW_DATA_DIR_2020)
ic = loader.load_iso_classes('isoall.txt', pt.REFERENCES_DIR)
cleaner2020 = cleaner.Cleaner2020()
cleaner2020 = cleaner.Cleaner()
patient_data = td[['CitizenId', 'Gender', 'BirthYear']].drop_duplicates(keep='first')
patient_data = cleaner2020.clean_patient_data(patient_data)
screening_content = cleaner2020.clean_screening_content(sc, patient_data)
......@@ -22,31 +22,31 @@ def main():
training_done = cleaner2020.clean_training_done(td, patient_data)
training_cancelled = cleaner2020.clean_training_cancelled(tc, patient_data)
assistive_aids = cleaner2020.clean_assistive_aids(ats, ic)
with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'sc.pkl'), 'wb') as fd:
outfile = BytesIO()
file_writer.write_pickle(screening_content, outfile)
outfile.seek(0)
shutil.copyfileobj(outfile, fd)
with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'ss.pkl'), 'wb') as fd:
outfile = BytesIO()
file_writer.write_pickle(status_set, outfile)
outfile.seek(0)
shutil.copyfileobj(outfile, fd)
with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'td.pkl'), 'wb') as fd:
outfile = BytesIO()
file_writer.write_pickle(training_done, outfile)
outfile.seek(0)
shutil.copyfileobj(outfile, fd)
with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'tc.pkl'), 'wb') as fd:
outfile = BytesIO()
file_writer.write_pickle(training_cancelled, outfile)
outfile.seek(0)
shutil.copyfileobj(outfile, fd)
with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'ats.pkl'), 'wb') as fd:
outfile = BytesIO()
file_writer.write_pickle(assistive_aids, outfile)
......@@ -58,6 +58,6 @@ def main():
file_writer.write_pickle(ic, outfile)
outfile.seek(0)
shutil.copyfileobj(outfile, fd)
if __name__ == "__main__":
main()
\ No newline at end of file
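The five near-identical pickle blocks above could be folded into one helper. A sketch under the assumption that file_writer.write_pickle accepts any binary file-like object, as the BytesIO usage suggests:

import shutil
from io import BytesIO
from pathlib import Path
from tools import file_writer

def write_pickle_to_dir(data, directory, file_name):
    # Serialize into memory first, then copy the bytes to disk.
    outfile = BytesIO()
    file_writer.write_pickle(data, outfile)
    outfile.seek(0)
    with open(Path.joinpath(directory, file_name), 'wb') as fd:
        shutil.copyfileobj(outfile, fd)

# e.g. write_pickle_to_dir(screening_content, pt.INTERIM_DATA_DIR, 'sc.pkl')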
#!/usr/bin/env python
import paths as pt
from tools import file_reader, file_writer, data_loader
from tools import file_reader, file_writer
from tools import preprocessor
from utility import embedder
from utility.settings import load_settings
......@@ -9,23 +9,22 @@ import numpy as np
from pathlib import Path
from io import StringIO
import shutil
from pathlib import Path
def main():
for label_name in ["Complete", "Compliance", "Fall", "Risk"]:
data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
ats_resolution = data_settings['ats_resolution']
if label_name == "Risk":
target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
ex_resolution = target_settings['ex_resolution']
if label_name in ["Complete", "Compliance", "Fall"]:
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
infile = StringIO()
file_path = pt.PROCESSED_DATA_DIR
file_name = f'{label_name.lower()}.csv'
with open(Path.joinpath(file_path, file_name), 'r') as fd:
with open(Path.joinpath(file_path, file_name), 'r', encoding='utf8') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df = file_reader.read_csv(infile, converters=ats)
......@@ -36,11 +35,11 @@ def main():
infile = StringIO()
file_path = pt.PROCESSED_DATA_DIR
file_name = f'{label_name.lower()}.csv'
with open(Path.joinpath(file_path, file_name), 'r') as fd:
with open(Path.joinpath(file_path, file_name), 'r', encoding='utf8') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df = file_reader.read_csv(infile, converters=converters)
if label_name in ["Complete", "Compliance", "Fall"]:
cols_ats = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
unique_ats = [df[f'{i}Ats'].unique() for i in range(1, ats_resolution+1)]
......@@ -68,7 +67,7 @@ def main():
outfile = StringIO()
file_path = pt.PROCESSED_DATA_DIR
file_name = f'{label_name.lower()}_count.csv'
with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
with open(Path.joinpath(file_path, file_name), 'w', newline='', encoding='utf8') as fd:
file_writer.write_csv(df, outfile)
outfile.seek(0)
shutil.copyfileobj(outfile, fd)
......
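The open-copy-seek pattern around file_reader.read_csv recurs in several scripts in this commit (and is mirrored by the read_csv helper added further down). A sketch of the shared idea, assuming file_reader.read_csv accepts any text buffer; the helper name is illustrative:

import shutil
from io import StringIO
from pathlib import Path
from tools import file_reader

def load_csv(file_path, file_name, **read_kwargs):
    # Buffer the file contents in memory, then hand the buffer to the reader.
    infile = StringIO()
    with open(Path.joinpath(file_path, file_name), 'r', encoding='utf8') as fd:
        shutil.copyfileobj(fd, infile)
    infile.seek(0)
    return file_reader.read_csv(infile, **read_kwargs)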
......@@ -12,26 +12,26 @@ def main(ats_resolution: int = None):
infile = StringIO()
file_path = pt.INTERIM_DATA_DIR
file_name = 'screenings.csv'
with open(Path.joinpath(file_path, file_name), 'r') as fd:
with open(Path.joinpath(file_path, file_name), 'r', encoding='utf8') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
screenings = file_reader.read_csv(infile, converters={'CitizenId': str})
data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
if ats_resolution is None:
ats_resolution = data_settings['ats_resolution']
df = screenings.copy()
accum_screenings = labeler.accumulate_screenings(df, data_settings)
for label_name in ['Complete', 'Compliance', 'Fall', 'Risk']:
target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
features = target_settings['features']
if label_name == "Risk":
ex_resolution = target_settings['ex_resolution']
risk_period_months = target_settings['risk_period_months']
if label_name == 'Complete':
df = labeler.make_complete_label(accum_screenings)
elif label_name == 'Compliance':
......@@ -40,7 +40,7 @@ def main(ats_resolution: int = None):
df = labeler.make_fall_label(accum_screenings)
else:
df = labeler.make_risk_label(accum_screenings, risk_period_months)
df = preprocessor.split_cat_columns(df, col_to_split='Ats',
tag='Ats', resolution=ats_resolution)
if label_name == "Risk":
......@@ -57,10 +57,11 @@ def main(ats_resolution: int = None):
outfile = StringIO()
file_path = pt.PROCESSED_DATA_DIR
file_name = f'{label_name.lower()}.csv'
with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
with open(Path.joinpath(file_path, file_name), 'w',
newline='', encoding='utf8') as fd:
file_writer.write_csv(df, outfile)
outfile.seek(0)
shutil.copyfileobj(outfile, fd)
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -6,10 +6,11 @@ import pandas as pd
import paths as pt
from tools import file_reader, file_writer, inputter
from utility.settings import load_settings
from utility import data_dto, dataset
from utility import data_dto
from pandas.tseries.offsets import DateOffset
from io import StringIO, BytesIO
import shutil
from typing import List, Tuple
def main():
with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'sc.pkl'), 'rb') as fd:
......@@ -53,7 +54,7 @@ def main():
shutil.copyfileobj(outfile, fd)
def get_screenings(data, settings):
ids = dataset.create_union_of_ids(data.sc, data.ss, data.td, data.tc)
ids = create_union_of_ids(data.sc, data.ss, data.td, data.tc)
all_screenings = pd.DataFrame()
for id in ids:
......@@ -177,5 +178,11 @@ def get_screenings_by_id(data, id, settings):
return screenings
def create_union_of_ids(*args: pd.DataFrame) -> List[str]:
ids = []
for arg in args:
ids = list(set().union(ids, arg['CitizenId'].unique()))
return ids
if __name__ == "__main__":
main()
\ No newline at end of file
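A quick usage sketch of the new create_union_of_ids helper with toy frames (hypothetical data; the real call site passes the sc, ss, td and tc frames loaded above):

import pandas as pd

sc = pd.DataFrame({'CitizenId': ['1', '2']})
td = pd.DataFrame({'CitizenId': ['2', '3']})
# Order is not guaranteed because the helper goes through a set.
assert sorted(create_union_of_ids(sc, td)) == ['1', '2', '3']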
......@@ -4,23 +4,19 @@ import numpy as np
from pathlib import Path
import paths as pt
import pyodbc
from tools import preprocessor
from tools import preprocessor, file_reader
from datetime import date, datetime, timedelta
import random
from io import StringIO
import shutil
def main():
# Load data
df_home_care = pd.read_csv(Path.joinpath(pt.RAW_DATA_DIR_TEST, "Hjemmepleje.csv"),
encoding="iso-8859-1",
skiprows=2)
df_ats = pd.read_csv(Path.joinpath(pt.RAW_DATA_DIR_TEST, "Hjælpemidler.csv"),
encoding="iso-8859-1",
skiprows=2,
converters={'HMI nr': str, 'Kategori ISO nummer': str})
df_training = pd.read_csv(Path.joinpath(pt.RAW_DATA_DIR_TEST, "Træning.csv"),
encoding="iso-8859-1",
skiprows=2)
data_dir = pt.RAW_DATA_DIR_TEST
df_home_care = read_csv(data_dir, "Hjemmepleje.csv", encoding="iso-8859-1", skiprows=2)
df_ats = read_csv(data_dir, "Hjælpemidler.csv", encoding="iso-8859-1", skiprows=2,
converters={'HMI nr': str, 'Kategori ISO nummer': str})
df_training = read_csv(data_dir, "Træning.csv", encoding="iso-8859-1", skiprows=2)
df_general = df_home_care.drop_duplicates(subset=["Borger Id"])[["Borger Id",
"Alder (aktuel)"]].reset_index(drop=True)
......@@ -64,7 +60,7 @@ def main():
df_general['CPR'] = df_general.apply(lambda x: get_ssn(x['Alder (aktuel)'], x['Køn']), axis=1)
# Insert general data
# Db
server = "tcp:air-db-server.database.windows.net,1433"
database = "air-db"
username = "airadmin"
......@@ -72,6 +68,7 @@ def main():
cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
cursor = cnxn.cursor()
# General data
ids = list()
for index, row in df_general.iterrows():
cursor.execute("INSERT INTO dbo.citizen (first_name,last_name,ssn,age,gender) values(?,?,?,?,?)",
......@@ -149,5 +146,16 @@ def main():
cursor.close()
def read_csv(file_path, file_name, converters=None,
encoding=None, skiprows=None):
infile = StringIO()
with open(Path.joinpath(file_path, file_name), 'r', encoding=encoding) as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
return file_reader.read_csv(infile,
converters=converters,
encoding=encoding,
skiprows=skiprows)
if __name__ == '__main__':
main()
\ No newline at end of file
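For reference, a minimal pyodbc sketch of the parameterized insert pattern used above, with placeholder credentials and row values (the real ones are elided or generated in this diff):

import pyodbc

cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=<server>;'
                      'DATABASE=<db>;UID=<user>;PWD=<password>')
cursor = cnxn.cursor()
# pyodbc binds the '?' placeholders to the positional arguments.
cursor.execute("INSERT INTO dbo.citizen (first_name,last_name,ssn,age,gender) "
               "values(?,?,?,?,?)",
               'Jane', 'Doe', '010190-1234', 31, 'FEMALE')
cnxn.commit()
cursor.close()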
......@@ -35,7 +35,7 @@ def main():
settings).load_data()
X, y = dl.prepare_data()
else:
settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
settings = load_settings(pt.CONFIGS_DIR, "risk.yaml")
file_name = f'risk_{DATASET_VERSION}.csv'
dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
file_name,
......
from pathlib import Path
PATHS_2019 = ['DigiRehab_BorgerID_TrainingDone.xlsx',
'DigiRehab_BorgerID_TrainingCancelled.xlsx',
'DigiRehab_BorgerID_StatusSet.xlsx',
'DigiRehab_BorgerID_ScreeningContent.xlsx',
'HjaelpemidlerUdtraek.csv',
'isoall.txt',
'DigiRehab_borgerIDALL.csv',
'clusters.csv']
PATHS_2020 = ['borgere_hmi_Rasmus_BorgerId_Gender_BirthYear.xlsx',
'DrPatientData_RasmusPlusBorgerIdMinusCPR_2020.xlsx',
'Observationer_Rasmus_BorgerId_Gender_BirthYear.xlsx',
......
import numpy as np
import tensorflow as tf
import xgboost as xgb
"""
classifiers.py
====================================
Module to store classifiers used for CV.
"""
from abc import ABC, abstractmethod
from typing import Tuple, List
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from tools import preprocessor
from abc import ABC, abstractmethod
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from typing import Tuple, List
from keras.wrappers.scikit_learn import KerasClassifier
import numpy as np
import tensorflow as tf
import xgboost as xgb
class BaseClassifer(ABC):
"""
Base class for classifiers.
"""
def __init__(self, X, y):
"""Initilizes inputs and targets variables"""
"""Initilizes inputs and targets variables."""
self.X = X
self.y = y
@abstractmethod
def make_model(self):
"""
This method is an abstract method to be implemented
by a concrete classifier. Must return a sklearn-compatible
estimator object implementing 'fit'
estimator object implementing 'fit'.
"""
def evaluate(self, metrics:List = ['accuracy'], k: int=0) -> Tuple[dict,
np.ndarray]:
def evaluate(self, metrics:List=None, k: int=0) -> Tuple[dict,np.ndarray]:
"""
This method performs cross validation for k seeds
on a given dataset X and y and outputs the results
......@@ -47,26 +50,30 @@ class BaseClassifer(ABC):
for metric in metrics:
results[metric] = res_validate[f'test_{metric}']
return results
class KnnClassifier(BaseClassifer):
def make_model(self):
"""KNN classifier."""
def make_model(self):
return KNeighborsClassifier(n_neighbors=10,
weights='distance',
p=1)
class SvmClassifier(BaseClassifer):
"""Support vector machine classifier."""
def make_model(self):
return SVC(random_state=0,
class_weight="balanced",
probability=True)
class LrClassifier(BaseClassifer):
"""Logistic regression classifier."""
def make_model(self):
return LogisticRegression(max_iter=1000,
class_weight="balanced",
random_state=0)
class XgbClassifier(BaseClassifer):
"""XGBoost classifier."""
def make_model(self):
neg, pos = np.bincount(self.y)
scale_pos_weight = neg / pos
......@@ -80,12 +87,14 @@ class XgbClassifier(BaseClassifer):
return xgb.XGBClassifier(**params)
class RfClassifier(BaseClassifer):
"""Random Forest classifier."""
def make_model(self):
return RandomForestClassifier(n_estimators=800,
class_weight="balanced",
random_state=0)
class MlpClassifier(BaseClassifer):
"""Multi-layer Perceptron classifier."""
def make_model(self):
def make_keras_model():
model = tf.keras.models.Sequential()
......@@ -113,4 +122,3 @@ class MlpClassifier(BaseClassifer):
class_weight = preprocessor.get_class_weight(neg, pos)
return KerasClassifier(make_keras_model, epochs=20, batch_size=32,
class_weight=class_weight, verbose=False)
\ No newline at end of file
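A usage sketch for the classifiers above on synthetic data (assuming evaluate() falls back to ['accuracy'] when metrics is None, as the old default suggests, and that k is the number of seeds per the docstring):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = rng.integers(0, 2, size=100)

clf = RfClassifier(X, y)
# results maps each metric name to its per-fold CV scores.
results = clf.evaluate(metrics=['accuracy'], k=1)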
import pandas as pd
import numpy as np
import os
"""
cleaner.py
====================================
Module to clean raw data.
"""
from abc import ABC, abstractmethod
from typing import List
import pandas as pd
import numpy as np
class BaseCleaner(ABC):
"""
Base class for cleaners.
"""
@abstractmethod
def clean_clusters(self, cl):
"""Cleans the cluster data set"""
@abstractmethod
def clean_patient_data(self, ptd):
"""Cleans the patient data set"""
def clean_patient_data(self, patient_data):
"""Cleans the patient data set."""
@abstractmethod
def clean_screening_content(self, sc, ptd):
"""Cleans the screening content data set"""
def clean_screening_content(self, screening_content, patient_data):
"""Cleans the screening content data set."""
@abstractmethod
def clean_status_set(self, ss, ptd):
"""Cleans the status set data set"""
def clean_status_set(self, status_set, patient_data):
"""Cleans the status set data set."""
@abstractmethod
def clean_training_done(self, td, ptd):
"""Cleans the training done data set"""
def clean_training_done(self, training_done, patient_data):
"""Cleans the training done data set."""
@abstractmethod
def clean_training_cancelled(self, tc, ptd):
"""Cleans the training cancelled data set"""
def clean_training_cancelled(self, training_cancelled, patient_data):
"""Cleans the training cancelled data set."""
@abstractmethod
def clean_assistive_aids(self, ats, ic, ids):
"""Cleans the assistive aids data set"""
def clean_assistive_aids(self, ats, iso_classes):
"""Cleans the assistive aids data set."""
def remove_citizens_not_in_patient_data(self, train_data: pd.DataFrame,
patient_data: pd.DataFrame,
id_col: str) -> pd.DataFrame:
"""
This method removes citizens not in patient data set
This method removes citizens not in the patient data set.
:param train_data: DigiRehab training data
:param patient_data: DigiRehab patient data
:param id_col: the name of the column identifying a citizen
......@@ -48,7 +52,7 @@ class BaseCleaner(ABC):
def remove_citizens_without_valid_id(self, df: pd.DataFrame) -> pd.DataFrame:
"""
This method removes citizens without a valid id
This method removes citizens without a valid id.
:param df: a dataframe
:return: cleaned dataframe
"""
......@@ -60,10 +64,10 @@ class BaseCleaner(ABC):
return df
def merge_train_and_patient_data(self, train_data: pd.DataFrame,
patient_data: pd.DataFrame,
id_col: str) -> pd.DataFrame:
patient_data: pd.DataFrame,
id_col: str) -> pd.DataFrame:
"""
This method merges the training and patient data
This method merges the training and patient data.
:param train_data: DigiRehab training data
:param patient_data: DigiRehab patient data
:param id_col: the name of the column identifying a citizen
......@@ -71,18 +75,18 @@ class BaseCleaner(ABC):
"""
return pd.merge(train_data, patient_data, on=id_col)
def sort_dataframe(self, df: pd.DataFrame, by: str) -> pd.DataFrame:
def sort_dataframe(self, df: pd.DataFrame, col_name: str) -> pd.DataFrame:
"""
This method sorts a dataframe based on a column name
This method sorts a dataframe based on a column name.
:param df: dataframe to be sorted
:param by: column name to sort by
:param col_name: column name to sort by
:return: sorted dataframe
"""
return df.sort_values(by)
return df.sort_values(col_name)
def filter_ats_on_ids(self, df: pd.DataFrame, ids: List[str]) -> pd.DataFrame:
"""
This method filters a dataframe containing ats
This method filters a dataframe containing ats
data by a list of ids in the dataframe.
:param df: dataframe containing ats
:param ids: ids to filter by
......@@ -92,7 +96,7 @@ class BaseCleaner(ABC):
def remove_tainted_histories(self, df: pd.DataFrame) -> pd.DataFrame:
"""
This method removed tainted loan histories
This method removes tainted loan histories.
:param df: dataframe containing ats
:return: a cleaned dataframe
"""
......@@ -102,7 +106,7 @@ class BaseCleaner(ABC):
def remove_deprecated_device_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""
This method removed deprecated device data
This method removes deprecated device data.
:param df: dataframe containing ats
:return: a cleaned dataframe
"""
......@@ -110,7 +114,7 @@ class BaseCleaner(ABC):
def remove_rows_with_old_dates(self, df: pd.DataFrame, date_col: str) -> pd.DataFrame:
"""
This method removes rows with old dates
This method removes rows with old dates.
:param df: dataframe containing ats
:param date_col: the name of the date column
:return: a cleaned dataframe
......@@ -118,10 +122,10 @@ class BaseCleaner(ABC):
df[date_col] = pd.to_datetime(df[date_col])
mask = (df[date_col] >= '1900-01-01') & (df[date_col] <= pd.Timestamp('today'))
return df.loc[mask]
def drop_invalid_devices(self, df: pd.DataFrame, iso_classes: pd.DataFrame) -> pd.DataFrame:
"""
This method removes invalid devices not in iso classes
This method removes invalid devices not in iso classes.
:param df: dataframe containing ats
:param iso_classes: dataframe with the iso classes
:return: a cleaned dataframe
......@@ -130,49 +134,63 @@ class BaseCleaner(ABC):
def remove_screenings_without_exercises(self, df: pd.DataFrame) -> pd.DataFrame:
"""
This method removes screenings without exercises
This method removes screenings without exercises.
:param df: dataframe containing screenings
:return: a cleaned dataframe
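A minimal sketch of how a concrete cleaner plugs into BaseCleaner; the class name and method bodies are illustrative placeholders built from the base helpers above, not the repository's Cleaner implementation:

class DemoCleaner(BaseCleaner):
    def clean_clusters(self, cl):
        return self.remove_citizens_without_valid_id(cl)

    def clean_patient_data(self, patient_data):
        return self.remove_citizens_without_valid_id(patient_data)

    def clean_screening_content(self, screening_content, patient_data):
        return self.remove_citizens_not_in_patient_data(
            screening_content, patient_data, 'CitizenId')

    def clean_status_set(self, status_set, patient_data):
        return self.remove_citizens_not_in_patient_data(
            status_set, patient_data, 'CitizenId')

    def clean_training_done(self, training_done, patient_data):
        return self.merge_train_and_patient_data(
            training_done, patient_data, 'CitizenId')

    def clean_training_cancelled(self, training_cancelled, patient_data):
        return self.merge_train_and_patient_data(
            training_cancelled, patient_data, 'CitizenId')

    def clean_assistive_aids(self, ats, iso_classes):
        return self.drop_invalid_devices(ats, iso_classes)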