Commit d665f064 authored by thecml

added types and documentation

parent f490ecdf
Pipeline #93661 failed in 4 minutes and 1 second
......@@ -4,9 +4,9 @@
ats_iso_length: 6
ats_resolution: 10
alarm_ats: "222718"
threshold_weeks: 8
threshold_training: 10
fall_exercise_threshold: 3
fall_exercises: ['8058','8062','8066','8077','8074','8059','8071','8067']
alarm_ats: ['22271812', '22271813', '22271814', '22271816']
risk_period_months: 6
\ No newline at end of file
%% Cell type:code id: tags:
```
import yaml
from pathlib import Path
import pandas as pd
import numpy as np
import paths as pt
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from utility import metrics
from sklearn.metrics import confusion_matrix
from tools import data_loader, file_writer
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
# Load settings
with open(Path.joinpath(pt.CONFIGS_DIR, "complete_emb.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
protected_col_names = ['Gender_Male', 'Gender_Female']
protected_col_name = 'Gender'
y_col_name="Complete"
# Load the data
file_name = "complete_emb.csv"
dl = data_loader.CompleteDataLoader(file_name, settings).load_data()
X, y = dl.get_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
stratify=y, random_state=0)
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos
params = {"n_estimators": 400,
"objective": "binary:logistic",
"scale_pos_weight": scale_pos_weight,
"use_label_encoder": False,
"learning_rate": 0.1,
"eval_metric": "logloss",
"seed": 0
}
model = xgb.XGBClassifier(**params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
df_test = pd.DataFrame([],columns=list(X.columns)+["Complete"]+["output"]+["output_prob"])
```
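For reference, `scale_pos_weight` is XGBoost's standard lever for class imbalance: it scales the loss contribution of positive examples by the negative-to-positive ratio. A toy check of the computation above (label counts are illustrative):

```
# Minimal sketch of the class-imbalance weighting used above.
# With 90 negatives and 10 positives, scale_pos_weight = 9.0, so each
# positive example counts nine times as much in the XGBoost loss.
import numpy as np

y_toy = np.array([0] * 90 + [1] * 10)
neg, pos = np.bincount(y_toy)        # counts of labels 0 and 1
scale_pos_weight = neg / pos         # 90 / 10 = 9.0
print(scale_pos_weight)
```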
%% Cell type:code id: tags:
```
def get_df_w_metrics(df, protected_col_names, y_target_name, y_pred_name):
confusion_df = pd.DataFrame(columns=[protected_col_names, "FPR", "FNR"])
def get_df_w_metrics(df, protected_col_name, y_target_name, y_pred_name):
confusion_df = pd.DataFrame(columns=[protected_col_name, "FPR", "FNR"])
for name in list(df[protected_col_names].unique()):
a=df[df[protected_variable_name]==name][y_target_name]
b=df[df[protected_variable_name]==name][y_pred_name]
for name in list(df[protected_col_name].unique()):
a=df[df[protected_col_name]==name][y_target_name]
b=df[df[protected_col_name]==name][y_pred_name]
TN, FP, FN, TP = confusion_matrix(list(a), list(b),labels=[0, 1]).ravel()
TPR = TP/(TP+FN)
TNR = TN/(TN+FP)
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)
FPR = FP/(FP+TN)
FNR = FN/(TP+FN)
FDR = FP/(TP+FP)
ACC = (TP+TN)/(TP+FP+FN+TN)
LRplus=TPR/FPR
LRminus=FNR/TNR
F1=2*(PPV*TPR)/(PPV+TPR)
confusion_df = confusion_df.append({protected_variable_name:name, "TPR":TPR, "TNR":TNR, "FPR":FPR,
confusion_df = confusion_df.append({protected_col_name:name, "TPR":TPR, "TNR":TNR, "FPR":FPR,
"FNR":FNR, "PPV":PPV, "NPV":NPV, "FDR":FDR, "ACC":ACC,
"F1":F1, "LRplus":LRplus, "LRminus":LRminus, "TN":TN,
"FP":FP, "FN":FN, "TP":TP}, ignore_index=True)
return confusion_df
```
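Note that `pd.DataFrame.append`, used above, was deprecated in pandas 1.4 and removed in 2.0. A sketch of an append-free variant of the same per-group metric table, collecting rows in a list and materializing the frame once (only FPR/FNR shown for brevity):

```
# Sketch of an append-free variant of get_df_w_metrics: rows are collected
# in a plain list and turned into a DataFrame at the end.
import pandas as pd
from sklearn.metrics import confusion_matrix

def get_df_w_metrics_v2(df, protected_col_name, y_target_name, y_pred_name):
    rows = []
    for name in df[protected_col_name].unique():
        a = df.loc[df[protected_col_name] == name, y_target_name]
        b = df.loc[df[protected_col_name] == name, y_pred_name]
        TN, FP, FN, TP = confusion_matrix(a, b, labels=[0, 1]).ravel()
        rows.append({protected_col_name: name,
                     "FPR": FP / (FP + TN),
                     "FNR": FN / (TP + FN)})
    return pd.DataFrame(rows)
```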
%% Cell type:code id: tags:
```
i=0
y_valid_pred = 0*y
valid_acc, valid_pre, valid_recall, valid_roc_auc = list(), list(), list(), list()
for train_index, valid_index in skf.split(X_train, y_train):
X_train_split, X_valid_split = X_train.iloc[train_index,:], X_train.iloc[valid_index,:]
y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]
optimize_rounds = True
early_stopping_rounds = 50
if optimize_rounds:
eval_set=[(X_valid_split, y_valid_split)]
fit_model = model.fit(X_train_split, y_train_split,
eval_set=eval_set,
eval_metric=metrics.gini_xgb,
early_stopping_rounds=early_stopping_rounds,
verbose=False)
else:
fit_model = model.fit(X_train_split, y_train_split)
pred = fit_model.predict_proba(X_valid_split)[:,1]
y_valid_pred.iloc[valid_index] = pred
y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)
# Save data
y_true_pd=y_valid_split.to_frame().reset_index(drop=True)
y_pred_pd=y_valid_scores.apply(lambda x: 1 if x == True else 0).to_frame().reset_index(drop=True).rename(columns={"Complete" : "output"})
y_pred_prob_pd = pd.DataFrame(pred, columns = ["output_prob"])
df_subset = pd.concat([X_valid_split.reset_index(drop=True), y_true_pd, y_pred_pd, y_pred_prob_pd], axis=1)
df_test = df_test.append(df_subset, ignore_index=True)
# Save metrics
df_evaluate_proc = get_df_w_metrics(df_subset, protected_col_names, y_col_name, "output")
df_evaluate_proc = get_df_w_metrics(df_subset, protected_col_name, y_col_name, "output")
file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, "model"+str(i) + "_" + protected_col_name + ".csv")
for protected_col_name in protected_col_names:
df_evaluate_together = df_subset.copy()
#df_evaluate_together[protected_col_name] = "all"
df_evaluate_all = get_df_w_metrics(df_evaluate_together, protected_col_name, y_col_name, "output")
file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, "model"+str(i) + "_" + protected_col_name + "_all.csv")
df_evaluate_together = df_subset.copy()
df_evaluate_all = get_df_w_metrics(df_evaluate_together, protected_col_name, y_col_name, "output")
file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, "model"+str(i) + "_" + protected_col_name + "_all.csv")
valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
valid_pre.append(precision_score(y_valid_split, y_valid_scores))
valid_recall.append(recall_score(y_valid_split, y_valid_scores))
valid_roc_auc.append(roc_auc_score(y_valid_split, y_valid_pred.iloc[valid_index]))
i=i+1
```
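`metrics.gini_xgb` is imported from `utility` but not shown in this diff. A hypothetical sketch of what such an eval metric usually looks like: the normalized Gini coefficient, 2 * AUC - 1, returned in the (name, value) format XGBoost expects from a custom metric:

```
# Hypothetical sketch of metrics.gini_xgb (utility.metrics is not shown
# in this diff); the normalized Gini coefficient equals 2 * ROC AUC - 1.
from sklearn.metrics import roc_auc_score

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini = 2 * roc_auc_score(labels, preds) - 1
    # Some implementations return the negated value, since XGBoost's early
    # stopping assumes lower is better for custom metrics.
    return [('gini', gini)]
```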
%% Cell type:code id: tags:
```
file_writer.write_csv(df_test, pt.INTERIM_DATA_DIR, "all_test_data.csv")
```
......
import pandas as pd
import numpy as np
from tools import preprocessor, file_reader, explainer
import paths as pt
import os
import csv
import joblib
from pathlib import Path
def main():
model = file_reader.read_joblib(pt.COMPLETE_XGB_DIR,
'complete_xgboost.joblib')
input_data = {"Gender": [0],
"BirthYear": [46],
"LoanPeriod": [360],
"Ats": ["093307,222718,181210"]}
new_data_df = pd.DataFrame.from_dict(input_data)
new_data_df['NumberAts'] = len(new_data_df['Ats'][0].split(","))
df = preprocessor.split_cat_columns(new_data_df, col_to_split='Ats',
tag='Ats',
resolution=10)
cols_ats = [str(i)+'Ats' for i in range(1, 10+1)]
header_list = ['Gender', 'BirthYear', 'Cluster',
'LoanPeriod', 'NumberAts'] + cols_ats
df = df.reindex(columns=header_list)
df = df.fillna('0')
df['Cluster'] = 14
df['Cluster'] = pd.to_numeric(df['Cluster'])
for i in range(1, 10+1):
path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
embedding = file_reader.read_embedding(path, f'complete_{i}Ats.csv')
column = f'{i}Ats'
df[column] = df[column].replace(to_replace=embedding)
df[column] = pd.to_numeric(df[column])
prediction = model.predict(df)
probability = model.predict_proba(df).max()
_, shap_values = explainer.get_shap_tree_explainer(model, X_test=df)
shap_values_flat = [round(float(val), 3) for val in shap_values[0]]
shap_values_dict = dict(zip(df.columns, shap_values_flat))
print(f"Predicted {int(prediction[0])} with probability {round(float(probability), 3)*100}%")
for item, amount in shap_values_dict.items():
print("{} ({})".format(item, amount))
if __name__ == "__main__":
main()
\ No newline at end of file
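`preprocessor.split_cat_columns` is called here but defined elsewhere. A rough, hypothetical sketch of the behavior the call site implies: spread a comma-separated `Ats` string over positional columns `1Ats`..`10Ats`, padding missing slots with `'0'`:

```
# Hypothetical sketch of split_cat_columns (not shown in this diff):
# spreads a comma-separated 'Ats' string over positional columns
# '1Ats'..'{resolution}Ats', padding missing slots with '0'.
import pandas as pd

def split_cat_columns_sketch(df, col_to_split, tag, resolution):
    parts = df[col_to_split].str.split(',', expand=True)
    parts = parts.iloc[:, :resolution]          # keep at most `resolution` codes
    parts.columns = [f'{i + 1}{tag}' for i in range(parts.shape[1])]
    out = pd.concat([df.drop(columns=[col_to_split]), parts], axis=1)
    return out.fillna('0')
```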
import pandas as pd
import numpy as np
from tools import file_reader, explainer
import paths as pt
from pathlib import Path
def main():
model = file_reader.read_joblib(pt.COMPLETE_XGB_DIR,
'complete_xgboost.joblib')
converters = {str(i)+'Ats':str for i in range(1, 10+1)}
df = file_reader.read_csv(pt.TESTS_FILES_DIR,
'test_citizens.csv',
converters=converters)
for i in range(1, 10+1):
path = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
embedding = file_reader.read_embedding(path, f'complete_{i}Ats.csv')
column = f'{i}Ats'
df[column] = df[column].replace(to_replace=embedding)
df[column] = pd.to_numeric(df[column])
test_citizen = pd.DataFrame(df.iloc[0]).T
print(test_citizen)
prediction = model.predict(test_citizen)
probability = model.predict_proba(test_citizen).max()
_, shap_values = explainer.get_shap_tree_explainer(model, X_test=test_citizen)
shap_values_flat = [round(float(val), 3) for val in shap_values[0]]
shap_values_dict = dict(zip(test_citizen.columns, shap_values_flat))
print(f"Predicted {int(prediction[0])} with probability {round(float(probability), 3)*100}%")
for item, amount in shap_values_dict.items():
print("{} ({})".format(item, amount))
if __name__ == "__main__":
main()
\ No newline at end of file
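Both scripts substitute learned embedding values for raw ATS codes via `replace`. A self-contained toy version of that substitution, assuming `read_embedding` yields a {code -> scalar} mapping (the values here are made up):

```
# Toy sketch of the embedding substitution above, assuming read_embedding
# yields a {category code -> learned scalar} mapping.
import pandas as pd

embedding = {'222718': 0.314, '093307': -0.112, '0': 0.0}  # illustrative values
col = pd.Series(['222718', '093307', '0'], name='1Ats')
col = col.replace(to_replace=embedding)   # codes -> embedding values
col = pd.to_numeric(col)                  # ensure numeric dtype for the model
print(col)
```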
......@@ -10,6 +10,8 @@ import csv
from utility.settings import load_settings
from utility.metrics import compute_mean, compute_std
NUM_ITER = 1
def get_version_subtitle(version):
if version == "NoAts":
return "without Ats and/or Ex columns"
......@@ -34,9 +36,6 @@ def load_data_embedded(case, settings):
elif case == "Compliance":
dl = data_loader.ComplianceDataLoader("compliance_emb.csv", settings).load_data()
X, y = dl.get_data()
elif case == "Alarm":
dl = data_loader.AlarmDataLoader("alarm_emb.csv", settings).load_data()
X, y = dl.get_data()
else:
dl = data_loader.FallDataLoader("fall_emb.csv", settings).load_data()
X, y = dl.get_data()
......@@ -49,9 +48,6 @@ def load_data_count(case, settings):
elif case == "Compliance":
dl = data_loader.ComplianceDataLoader("compliance_count.csv", settings).load_data()
X, y = dl.get_data()
elif case == "Alarm":
dl = data_loader.AlarmDataLoader("alarm_count.csv", settings).load_data()
X, y = dl.get_data()
else:
dl = data_loader.FallDataLoader("fall_count.csv", settings).load_data()
X, y = dl.get_data()
......@@ -64,22 +60,18 @@ def load_data_ohe(case, settings):
elif case == "Compliance":
dl = data_loader.ComplianceDataLoader("compliance_ohe.csv", settings).load_data()
X, y = dl.get_data()
elif case == "Alarm":
dl = data_loader.AlarmDataLoader("alarm_ohe.csv", settings).load_data()
X, y = dl.get_data()
else:
dl = data_loader.FallDataLoader("fall_ohe.csv", settings).load_data()
X, y = dl.get_data()
return X, y
def main():
num_iter = 1
clf_names = ['KNN', 'SVM', 'LR', 'XGB', 'RF', 'MLP']
num_clfs = len(clf_names)
metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'f1']
cases = ["Complete", "Compliance", "Fall"]
for case in cases:
target_settings = load_settings(f'{case.lower()}_emb.yaml')
target_settings = load_settings(f'{case.lower()}.yaml')
data_settings = load_settings("data.yaml")
output_filename = f"{case} model baseline.csv"
header = ['clf', 'version', 'accuracy_mean', 'accuracy_std',
......@@ -105,9 +97,9 @@ def main():
X, y = load_data_ohe(case, target_settings)
X, y = prepare_data(X, y, target_settings)
results = train_clf(X, y, version, output_filename, metrics, num_iter)
results = train_clf(X, y, version, output_filename, metrics, NUM_ITER)
subtitle = get_version_subtitle(version)
make_plots(results, metrics, num_iter, num_clfs,
make_plots(results, metrics, NUM_ITER, num_clfs,
clf_names, case, version, subtitle)
def train_clf(X, y, version, output_filename, metrics, num_iter):
......
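The three `load_data_*` helpers differ only in file suffix; each repeats the same case dispatch. One table-driven alternative, sketched here as a design option rather than the repo's code:

```
# Sketch of a table-driven alternative to the if/elif loader chains
# (a design option, not the repo's code).
from tools import data_loader

LOADERS = {
    "Complete": data_loader.CompleteDataLoader,
    "Compliance": data_loader.ComplianceDataLoader,
    "Fall": data_loader.FallDataLoader,
}

def load_data(case, suffix, settings):
    """suffix is one of 'emb', 'count', 'ohe'."""
    loader_cls = LOADERS[case]
    dl = loader_cls(f"{case.lower()}_{suffix}.csv", settings).load_data()
    return dl.get_data()
```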
......@@ -17,19 +17,15 @@ def main():
cases = ["Complete", "Compliance", "Fall"]
for case in cases:
if case == "Complete":
settings = load_settings("complete_emb.yaml")
settings = load_settings("complete.yaml")
dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
X, y = dl.get_data()
elif case == "Compliance":
settings = load_settings("compliance_emb.yaml")
settings = load_settings("compliance.yaml")
dl = data_loader.ComplianceDataLoader("compliance_emb.csv", settings).load_data()
X, y = dl.get_data()
elif case == "Alarm":
settings = load_settings("alarm_emb.yaml")
dl = data_loader.AlarmDataLoader("alarm_count.csv", settings).load_data()
X, y = dl.get_data()
else:
settings = load_settings("fall_emb.yaml")
settings = load_settings("fall.yaml")
dl = data_loader.FallDataLoader("fall_emb.csv", settings).load_data()
X, y = dl.get_data()
......@@ -48,8 +44,8 @@ def main():
ascending=False)
shap_sorted_df = shap_sorted_df.reset_index()
importances = shap_sorted_df['shap_values']
features = shap_sorted_df['feature']
importances = list(shap_sorted_df['shap_values'])
features = list(shap_sorted_df['feature'])
plot_file_name = f"{case} SHAP feature values"
csv_file_name = f"{case} model features.csv"
file_writer.write_shap_importance_plot(features, importances, pt.REPORTS_PLOTS_DIR, plot_file_name)
......
......@@ -15,17 +15,16 @@ import matplotlib.pyplot as plt
def main():
# Load settings
with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), 'r') as stream:
with open(Path.joinpath(pt.CONFIGS_DIR, "fall.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
protected_col_name = "Gender_Male"
protected_col_name = "Gender"
y_col_name="Fall"
# Load the data
file_name = "fall_emb.csv"
dl = data_loader.AlarmDataLoader(file_name, settings).load_data()
X, y = dl.get_data()
X = X.drop(['Gender_Female'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
stratify=y, random_state=0)
......
......@@ -27,14 +27,13 @@ class DummyNormalizer(BaseEstimator, TransformerMixin):
class BoxCoxNormalizer(BaseEstimator, TransformerMixin):
def fit_transform(self, X, case=None):
if case == "Risk":
numeric_feats = ['Gender_Male', 'Gender_Female', 'BirthYear', 'Cluster',
numeric_feats = ['Gender', 'BirthYear', 'Cluster',
'LoanPeriod', 'NumberSplit', 'NumberScreening', 'NumberWeeks',
'MeanEvaluation', 'NumberFalls', 'NumberTraining', 'NumberTrainingWeek',
'TimeBetweenTraining', 'NumberWeeksNoTraining', 'Needs', 'Physics',
'NumberAts', 'NumberEx']
else:
numeric_feats = ['Gender_Male', 'Gender_Female', 'BirthYear',
'Cluster', 'LoanPeriod', 'NumberAts']
numeric_feats = ['Gender', 'BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts']
skewed_feats = X[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.25]
skewed_feats = skewed_feats.index
......
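`BoxCoxNormalizer.fit_transform` flags features with skew above 0.25; the transform itself falls outside this hunk. A self-contained sketch of the usual follow-up, applying `boxcox1p` to the flagged features (the 0.15 lambda is an assumption):

```
# Self-contained sketch of the likely tail of fit_transform (outside this
# hunk): Box-Cox-transform the features flagged as skewed.
import numpy as np
import pandas as pd
from scipy.stats import skew
from scipy.special import boxcox1p

np.random.seed(0)
X = pd.DataFrame({'LoanPeriod': np.random.exponential(300, 500),
                  'BirthYear': np.random.normal(45, 10, 500)})
numeric_feats = ['LoanPeriod', 'BirthYear']
skewed = X[numeric_feats].apply(lambda c: skew(c.dropna()))
skewed = skewed[skewed > 0.25].index          # same 0.25 cutoff as above
X[skewed] = boxcox1p(X[skewed], 0.15)         # lambda=0.15 is an assumption
```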
......@@ -2,4 +2,5 @@
# Settings for api -------------------------------------------------
#
ats_resolution: 10
\ No newline at end of file
ats_resolution: 10
alarm_ats: "222718"
\ No newline at end of file
......@@ -171,9 +171,10 @@ def predict_alarm(incoming_data: InputData):
ats_resolution = settings['ats_resolution']
data = validate_data(incoming_data)
alarm_ats = settings['alarm_ats']
incoming_ats = [x.strip(' ') for x in data['Ats'].split(",")]
if any(x in list(['222718']) for x in incoming_ats) == True:
raise HTTPException(status_code=400, detail="An alarm cannot be in feature set")
if any(x in list([alarm_ats]) for x in incoming_ats) == True:
raise HTTPException(status_code=400, detail=f"Ats {alarm_ats} cannot be in feature set")
df = prepare_data(data, ats_resolution)
model = read_joblib("alarm_rsf.joblib")
......@@ -279,7 +280,8 @@ def generate_arguments(df: pd.DataFrame, ats_resolution: int, case: str, prob: f
else:
arguments.append(f'Uden et {i}. hjælpemiddel.')
loan_period_argument = f"og en gennemsnitlig låneperiode på {int(df.iloc[0].LoanPeriod)} dage"
loan_period = int(df.iloc[0].LoanPeriod)
loan_period_argument = f"og en gennemsnitlig låneperiode på {loan_period} dage"
arguments.append(loan_period_argument)
arguments.append("gennemfører" if case == "Complete" else "falder")
......
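With `alarm_ats` moving from a single string to a list in the config, the membership test can be written to tolerate both shapes. A sketch (not the endpoint's code):

```
# Sketch of an alarm-code check that tolerates alarm_ats being either a
# single string or a list of codes (the config diff above shows both forms).
from typing import List, Union

def contains_alarm_ats(incoming_ats: List[str],
                       alarm_ats: Union[str, List[str]]) -> bool:
    codes = [alarm_ats] if isinstance(alarm_ats, str) else list(alarm_ats)
    return any(x in codes for x in incoming_ats)

# contains_alarm_ats(['093307', '222718'], '222718')          -> True
# contains_alarm_ats(['22271812'], ['22271812', '22271813'])  -> True
```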
......@@ -31,7 +31,6 @@ def main(ats_resolution: int = None):
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
df = df.drop(ats_cols, axis=1)
# Encode dataframe given params
target_settings = load_settings(f'{label_name.lower()}.yaml')
model_path = Path.joinpath(pt.ROOT_DIR, target_settings['model_path'])
df_enc = encode_dataframe(df=df_to_enc,
......@@ -52,9 +51,12 @@ def encode_dataframe(df, target_name, batch_size, train_ratio, epochs,
X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(df,
target_name,
train_ratio)
network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
batch_size=batch_size, network_layers=network_layers,
optimizer_fn=optimizer, verbose=verbose, model_path=model_path)
network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name,
epochs=epochs, batch_size=batch_size,
network_layers=network_layers,
optimizer_fn=optimizer,
verbose=verbose,
model_path=model_path)
network.fit(X_train, y_train, X_val, y_val)
network.save_model()
embedded_weights = network.get_embedded_weights()
......@@ -86,8 +88,10 @@ def encode_dataframe_cv(df, target_name, batch_size, train_ratio,
X, labels = preprocessor.encode_vector_label(X)
y = np.array(y)
network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
batch_size=batch_size, network_layers=network_layers,
network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name,
epochs=epochs,
batch_size=batch_size,
network_layers=network_layers,
verbose=verbose, model_path=model_path)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
......
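For intuition about what `NeuralEmbedder` trains, here is a minimal, self-contained illustration of an entity-embedding layer in Keras; it is not the repo's implementation, just the core idea behind `get_embedded_weights()`:

```
# Minimal illustration of an entity embedder (not the NeuralEmbedder
# implementation): an Embedding layer per categorical column, flattened
# and fed to a dense head.
import tensorflow as tf

n_categories = 50      # vocabulary size of one categorical column (assumed)
embedding_dim = 5      # learned vector size per category (assumed)

inp = tf.keras.Input(shape=(1,))
emb = tf.keras.layers.Embedding(n_categories, embedding_dim)(inp)
emb = tf.keras.layers.Flatten()(emb)
out = tf.keras.layers.Dense(1, activation='sigmoid')(emb)
model = tf.keras.Model(inp, out)
model.compile(optimizer='adam', loss='binary_crossentropy')
# After training, model.layers[1].get_weights()[0] is the
# (n_categories x embedding_dim) table that get_embedded_weights() exposes.
```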
......@@ -27,7 +27,7 @@ def main(ats_resolution: int = None):
else:
df = labeler.make_fall_label(accum_screenings)
# Split cat columns by ATS resolution
# Split cat columns by ats resolution
df = preprocessor.split_cat_columns(df, col_to_split='Ats',
tag='Ats', resolution=ats_resolution)
......
......@@ -10,7 +10,7 @@ import yaml
from utility.settings import load_settings
def main():
for label_name in ["Complete", "Compliance", "Alarm", "Fall"]:
for label_name in ["Complete", "Compliance", "Fall"]:
settings = load_settings("data.yaml")
ats = {str(i)+'Ats':str for i in range(1, settings['ats_resolution']+1)}
......
......@@ -19,6 +19,7 @@ def main():
data = data_dto.Data(sc, ss, td, tc, ats)
screenings = get_screenings(data, settings)
file_writer.write_csv(screenings, pt.INTERIM_DATA_DIR, 'screenings.csv')
def get_screenings(data, settings):
......@@ -76,9 +77,11 @@ def get_screenings_by_id(data, id, settings):
n_training_after = 0
if (n_screening == len(screening_split) - 1):
screening_date = screening.ScreeningDate
trainings_after = citizen_data.td.loc[(citizen_data.td['RatingDate'] > screening_date)]
rating_date_cond = citizen_data.td['RatingDate'] > screening_date
trainings_after = citizen_data.td.loc[rating_date_cond]
trainings_end_date = screening_date + DateOffset(weeks=15)
trainings_before_15 = trainings_after.loc[(trainings_after['RatingDate'] <= trainings_end_date)]
training_after_cond = (trainings_after['RatingDate'] <= trainings_end_date)
trainings_before_15 = trainings_after.loc[training_after_cond]
if len(trainings_before_15) > 0:
day_start = trainings_before_15.RatingDate.iloc[0]
day_end = trainings_before_15.RatingDate.iloc[-1]
......@@ -135,8 +138,10 @@ def get_screenings_by_id(data, id, settings):
single_screening['Ex'] = inputter.get_exercise_content(screening)
single_screening['NumberEx'] = inputter.get_number_exercises(screening)
single_screening['HasFallRisk'] = sum(map(screening.ExerciseContent.count,
settings['fall_exercises'])) > settings['fall_exercise_threshold']
ex_count = screening.ExerciseContent.count
ex_fall = settings['fall_exercises']
ex_threshold = settings['fall_exercise_threshold']
single_screening['HasFallRisk'] = sum(map(ex_count, ex_fall)) > ex_threshold
screenings = pd.concat([screenings, single_screening], axis=0, ignore_index=True)
......
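The refactored `HasFallRisk` line counts how many configured fall-exercise codes occur in a screening's exercise content. A standalone illustration using the codes and threshold from the config diff at the top:

```
# Standalone illustration of the HasFallRisk refactoring above, using the
# fall_exercises / fall_exercise_threshold values from the config diff.
exercise_content = "8058,8062,8066,8077,1234"
fall_exercises = ['8058', '8062', '8066', '8077', '8074', '8059', '8071', '8067']
fall_exercise_threshold = 3

ex_count = exercise_content.count               # bound method, as in the hunk
has_fall_risk = sum(map(ex_count, fall_exercises)) > fall_exercise_threshold
print(has_fall_risk)  # 4 matching codes > 3 -> True
```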
......@@ -37,7 +37,7 @@ def main():
.diff().dt.days.fillna(0).astype(int)
# Tag alarm lends, save alarm citizens and filter subsequent lends
alarm_ats = "222718"
alarm_ats = settings['alarm_ats']
df['IsAlarmLend'] = df.apply(lambda x: 1 if alarm_ats in x['DevISOClass'] else 0, axis=1)
alarm_citizen_ids = list(df.loc[df['IsAlarmLend'] == 1]['CitizenId'])
alarm_dict = dict(df.loc[df['IsAlarmLend'] == 1][['CitizenId', 'DeltaLends']].values)
......
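The alarm-lend tagging applies a substring test row by row; pandas' vectorized `str.contains` produces the same flags. A toy sketch (IDs and codes are illustrative):

```
# Toy sketch of the alarm-lend tagging above; str.contains is the
# vectorized equivalent of the row-wise apply with a substring test.
import pandas as pd

df = pd.DataFrame({'CitizenId': [1, 2, 3],
                   'DevISOClass': ['22271812', '093307', '22271814']})
alarm_ats = '222718'   # value read from settings['alarm_ats'] in the hunk
df['IsAlarmLend'] = df['DevISOClass'].str.contains(alarm_ats, regex=False).astype(int)
print(df)              # rows 1 and 3 are flagged
```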
......@@ -11,9 +11,12 @@ from sklearn.svm import SVC
from tools import preprocessor
from abc import ABC, abstractmethod
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from typing import Tuple
from typing import Tuple, List
class BaseClassifer(ABC):
"""
Base class for classifiers.
"""
def __init__(self, X, y):
"""Initilizes inputs and targets variables"""
self.X = X
......@@ -21,9 +24,22 @@ class BaseClassifer(ABC):
@abstractmethod
def make_model(self):
"""Makes a model"""
"""
Abstract method to be implemented by a concrete
classifier. Must return a sklearn-compatible
estimator object implementing 'fit'.
"""
def evaluate(self, metrics=['accuracy'], k=0) -> Tuple[dict, np.ndarray]:
def evaluate(self, metrics:List = ['accuracy'], k: int=0) -> Tuple[dict,
np.ndarray]:
"""
Performs stratified K-fold cross-validation on the
dataset X and y, seeded by k, and reports the results
of the splits for the given scoring metrics.
:param metrics: scoring metrics as a list
:param k: the random seed for shuffling the folds
:return: the results from a stratified K-fold CV process
"""
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=k)
model = self.make_model()
res_validate = cross_validate(model, self.X, self.y, cv=skf, scoring=metrics)
......@@ -93,7 +109,6 @@ class MlpClassifier(BaseClassifer):
optimizer="Adam",
metrics=metrics)
return model
neg, pos = np.bincount(self.y)
class_weight = preprocessor.get_class_weight(neg, pos)
return KerasClassifier(make_keras_model, epochs=20, batch_size=32,
......
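The newly documented `evaluate` contract is a stratified 5-fold CV seeded by `k`. A self-contained illustration of that pattern with a stand-in estimator:

```
# Self-contained illustration of the evaluate() pattern documented above:
# stratified 5-fold CV with a seed k, scored on a list of metrics.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate

X, y = make_classification(n_samples=200, random_state=0)
k = 0
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=k)
res = cross_validate(LogisticRegression(max_iter=1000), X, y,
                     cv=skf, scoring=['accuracy', 'roc_auc'])
print(res['test_accuracy'].mean(), res['test_roc_auc'].mean())
```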
......@@ -35,11 +35,23 @@ class BaseCleaner(ABC):
def remove_citizens_not_in_patient_data(self, train_data: pd.DataFrame,
patient_data: pd.DataFrame,
id: str) -> pd.DataFrame:
data = train_data[train_data[id].isin(patient_data[id].unique())]
id_col: str) -> pd.DataFrame:
"""
Removes citizens that are not in the patient data set
:param train_data: DigiRehab training data
:param patient_data: DigiRehab patient data
:param id_col: the name of the column identifying a citizen
:return: cleaned dataframe
"""
data = train_data[train_data[id_col].isin(patient_data[id_col].unique())]
return data
def remove_citizens_without_valid_id(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Removes citizens without a valid ID
:param df: a dataframe
:return: cleaned dataframe
"""
df = df[df['CitizenId'] != "0000000000"]
df = df[df['CitizenId'] != '0']
df = df[df['CitizenId'] != "#VALUE!"]
......@@ -49,32 +61,79 @@ class BaseCleaner(ABC):
def merge_train_and_patient_data(self, train_data: pd.DataFrame,
patient_data: pd.DataFrame,
key: str) -> pd.DataFrame:
return pd.merge(train_data, patient_data, on=key)
id_col: str) -> pd.DataFrame:
"""
Merges the training and patient data
:param train_data: DigiRehab training data
:param patient_data: DigiRehab patient data
:param id_col: the name of the column identifying a citizen
:return: merged dataframe
"""
return pd.merge(train_data, patient_data, on=id_col)
def sort_dataframe(self, data: pd.DataFrame, by: str) -> pd.DataFrame:
return data.sort_values(by)
def sort_dataframe(self, df: pd.DataFrame, by: str) -> pd.DataFrame:
"""