Commit 6684e69d authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

renamed lib to tools

parent a7683c81
Pipeline #39266 passed with stage
in 2 minutes and 15 seconds
......@@ -3,7 +3,7 @@
import numpy as np
import pandas as pd
import config as cfg
from lib import file_writer, parser, cleaner
from tools import file_writer, parser, cleaner
def main():
parser2020 = parser.Parser2020()
......
......@@ -5,7 +5,7 @@ import numpy as np
import pandas as pd
import config as cfg
import log as log
from lib import file_reader, file_writer
from tools import file_reader, file_writer
from utility import dataset, cluster_maker, kmodes_wrapper
def main():
......
......@@ -3,8 +3,8 @@
import pandas as pd
import numpy as np
import config as cfg
from lib import file_reader, file_writer
from lib import feature_maker, preprocessor
from tools import file_reader, file_writer
from tools import feature_maker, preprocessor
def main():
converters = {'CitizenId': str}
......
import config as cfg
import numpy as np
import pandas as pd
from lib import file_reader, file_writer, preprocessor
from tools import file_reader, file_writer, preprocessor
def main():
# Load the data
......
......@@ -3,7 +3,7 @@
import numpy as np
import pandas as pd
import config as cfg
from lib import file_reader, file_writer, feature_maker, preprocessor
from tools import file_reader, file_writer, feature_maker, preprocessor
from utility import data_dto, dataset
def main():
......
......@@ -3,7 +3,7 @@
import numpy as np
import pandas as pd
import config as cfg
from lib import file_reader, feature_maker
from tools import file_reader, feature_maker
from utility import cluster_maker, kmodes_wrapper
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
......
# !/usr/bin/env python
import config as cfg
import pandas as pd
import numpy as np
import os
import abc
class BaseCleaner(metaclass=abc.ABCMeta):
    """Abstract interface for the per-year data cleaners.

    Concrete subclasses (one per export format/year) implement the same set
    of clean_* hooks. Judging by the implementations below, the arguments
    are pandas DataFrames: cl=clusters, ptd=patient data, sc=screening
    content, ss=status set, td=training done, tc=training cancelled,
    ats=assistive aids, ic=ISO classes, fd=fall data.
    """

    @abc.abstractmethod
    def clean_clusters(self, cl):
        """Cleans the cluster data set"""

    @abc.abstractmethod
    def clean_patient_data(self, ptd):
        """Cleans the patient data set"""

    @abc.abstractmethod
    def clean_screening_content(self, sc, ptd):
        """Cleans the screening content data set"""

    @abc.abstractmethod
    def clean_status_set(self, ss, ptd):
        """Cleans the status set data set"""

    @abc.abstractmethod
    def clean_training_done(self, td, ptd):
        """Cleans the training done data set"""

    @abc.abstractmethod
    def clean_training_cancelled(self, tc, ptd):
        """Cleans the training cancelled data set"""

    @abc.abstractmethod
    def clean_assistive_aids(self, ats, ic, ids):
        """Cleans the assistive aids data set"""

    @abc.abstractmethod
    def clean_fall_data(self, fd):
        """Cleans the fall set"""
class Cleaner2020(BaseCleaner):
    """Cleaning rules for the 2020 data export (joined on cfg.CITIZEN_ID)."""

    def clean_clusters(self, cl):
        # Cluster data needs no cleaning in the 2020 format.
        return cl

    def clean_patient_data(self, ptd):
        ptd = remove_citizens_without_valid_id(ptd)
        return ptd

    def clean_screening_content(self, sc, ptd):
        sc = remove_citizens_not_in_patient_data(sc, ptd, cfg.CITIZEN_ID)
        sc = merge_train_and_patient_data(sc, ptd, cfg.CITIZEN_ID)
        sc = sort_dataframe(sc, [cfg.CITIZEN_ID, cfg.SCREENING_DATE])
        return sc

    def clean_status_set(self, ss, ptd):
        ss = remove_citizens_not_in_patient_data(ss, ptd, cfg.CITIZEN_ID)
        ss = merge_train_and_patient_data(ss, ptd, cfg.CITIZEN_ID)
        ss = sort_dataframe(ss, [cfg.CITIZEN_ID, cfg.CHANGE_DATE])
        return ss

    def clean_training_done(self, td, ptd):
        # Unlike 2019, training-done rows are not merged with patient data here.
        td = remove_citizens_not_in_patient_data(td, ptd, cfg.CITIZEN_ID)
        td = sort_dataframe(td, [cfg.CITIZEN_ID, cfg.RATING_DATE])
        return td

    def clean_training_cancelled(self, tc, ptd):
        tc = remove_citizens_not_in_patient_data(tc, ptd, cfg.CITIZEN_ID)
        tc = merge_train_and_patient_data(tc, ptd, cfg.CITIZEN_ID)
        tc = sort_dataframe(tc, [cfg.CITIZEN_ID, cfg.RATING_DATE])
        return tc

    def clean_assistive_aids(self, ats, ic, ids=None):
        # `ids` is unused in the 2020 format; kept for interface compatibility.
        ats = sort_dataframe(ats, [cfg.CITIZEN_ID, cfg.LEND_DATE])
        ats = remove_citizens_without_valid_id(ats)
        ats = remove_rows_with_old_dates(ats, cfg.LEND_DATE)
        ats = remove_deprecated_device_data(ats)
        ats = remove_tainted_histories(ats)
        ats = drop_invalid_devices(ats, ic)
        return ats

    def clean_fall_data(self, fd):
        fd = remove_citizens_without_valid_id(fd)
        fd = sort_dataframe(fd, [cfg.CITIZEN_ID, cfg.DATE])
        return fd
class Cleaner2019(BaseCleaner):
    """Cleaning rules for the 2019 data export (joined on cfg.PATIENT_ID).

    Fix: ``clean_assistive_aids`` previously called
    ``drop_invalid_devices(ats, iso_classes)`` with an undefined name
    ``iso_classes`` (a NameError at runtime); it now passes the ``ic``
    parameter, mirroring Cleaner2020.
    """

    def clean_clusters(self, cl):
        # Cluster data needs no cleaning in the 2019 format.
        return cl

    def clean_patient_data(self, ptd):
        ptd = remove_citizens_without_valid_id(ptd)
        return ptd

    def clean_screening_content(self, sc, ptd):
        sc = remove_citizens_not_in_patient_data(sc, ptd, cfg.PATIENT_ID)
        sc = remove_screenings_without_exercises(sc)
        sc = merge_train_and_patient_data(sc, ptd, cfg.PATIENT_ID)
        sc = sort_dataframe(sc, [cfg.CITIZEN_ID, cfg.SCREENING_DATE])
        return sc

    def clean_status_set(self, ss, ptd):
        ss = remove_citizens_not_in_patient_data(ss, ptd, cfg.PATIENT_ID)
        ss = merge_train_and_patient_data(ss, ptd, cfg.PATIENT_ID)
        ss = sort_dataframe(ss, [cfg.CITIZEN_ID, cfg.CHANGE_DATE])
        return ss

    def clean_training_done(self, td, ptd):
        td = remove_citizens_not_in_patient_data(td, ptd, cfg.PATIENT_ID)
        td = merge_train_and_patient_data(td, ptd, cfg.PATIENT_ID)
        td = sort_dataframe(td, [cfg.CITIZEN_ID, cfg.RATING_DATE])
        return td

    def clean_training_cancelled(self, tc, ptd):
        tc = remove_citizens_not_in_patient_data(tc, ptd, cfg.PATIENT_ID)
        tc = merge_train_and_patient_data(tc, ptd, cfg.PATIENT_ID)
        tc = sort_dataframe(tc, [cfg.CITIZEN_ID, cfg.RATING_DATE])
        return tc

    def clean_assistive_aids(self, ats, ic, ids=None):
        ats = sort_dataframe(ats, [cfg.CITIZEN_ID, cfg.LEND_DATE])
        ats = filter_ats_on_ids(ats, ids)
        ats = remove_rows_with_old_dates(ats, cfg.LEND_DATE)
        ats = remove_rows_with_old_dates(ats, cfg.RETURN_DATE)
        ats = remove_deprecated_device_data(ats)
        ats = remove_tainted_histories(ats)
        # Fixed: was drop_invalid_devices(ats, iso_classes) — undefined name.
        ats = drop_invalid_devices(ats, ic)
        return ats

    def clean_fall_data(self, fd):
        # No fall data exists in the 2019 export.
        raise NotImplementedError
def drop_invalid_devices(ats, iso_classes):
    """Keep only rows whose device ISO class appears in *iso_classes*."""
    valid_classes = iso_classes.DevISOClass
    is_valid = ats[cfg.DEV_ISO_CLASS].isin(valid_classes)
    return ats[is_valid]
def remove_screenings_without_exercises(df):
    """Drop screenings whose exercise content is the literal placeholder 'nan'."""
    has_exercises = df[cfg.EXERCISE_CONTENT] != 'nan'
    return df[has_exercises]
def remove_citizens_not_in_patient_data(train_data, patient_data, id):
    """Keep only rows of *train_data* whose *id* also occurs in *patient_data*."""
    known_ids = set(patient_data[id].unique())
    return train_data[train_data[id].isin(known_ids)]
def remove_citizens_without_valid_id(df):
    """Drop rows whose citizen id is a known placeholder or invalid value."""
    invalid_ids = ("0000000000", '0', "#VALUE!", 'nan', '681')
    for bad_id in invalid_ids:
        df = df[df[cfg.CITIZEN_ID] != bad_id]
    return df
def merge_train_and_patient_data(train_data, patient_data, key):
    """Inner-join the training rows with the patient master data on *key*."""
    merged = train_data.merge(patient_data, on=key)
    return merged
def sort_dataframe(data, by):
    """Return *data* sorted ascending by the given column name(s)."""
    return data.sort_values(by=by)
def filter_ats_on_ids(ats, ids):
    """Keep only assistive-aid rows belonging to citizens listed in *ids*."""
    keep = ats[cfg.CITIZEN_ID].isin(ids)
    return ats[keep]
def remove_tainted_histories(ats):
    """Drop every citizen whose history contains the deprecated HMI number '899,999'."""
    tainted_citizens = ats.loc[ats[cfg.DEV_HMI_NUMBER] == '899,999', cfg.CITIZEN_ID].unique()
    return ats[~ats[cfg.CITIZEN_ID].isin(tainted_citizens)]
def remove_deprecated_device_data(ats):
    """Drop individual rows that carry the deprecated HMI number '899,999'."""
    deprecated = ats[cfg.DEV_HMI_NUMBER] == '899,999'
    return ats[~deprecated]
def remove_rows_with_old_dates(ats, col):
ats[col] = pd.to_datetime(ats[col])
mask = (ats[col] >= '1900-01-01') & (ats[col] <= pd.Timestamp('today'))
return ats.loc[mask]
\ No newline at end of file
import pandas as pd
from lime.lime_tabular import LimeTabularExplainer
def train_and_save_explainer(model_data_and_results, device_features, X_devices,
                             fitted_model, tag, model_name, device_feature_name):
    """Fit a LIME tabular explainer on the device feature matrix and persist it.

    NOTE(review): `file_writer` and `cfg` are not imported in this module as
    shown here — confirm the imports exist, otherwise the final write raises
    NameError.
    """
    # Feature subset used by this model (first row of the stored feature lists).
    feature_names = model_data_and_results.features.iloc[0]
    cat_feature_names = [item for item in device_features if item in feature_names]
    categorical_features = None
    if len(device_features):
        # Translate categorical feature names into positional column indices,
        # as LimeTabularExplainer expects integer column positions.
        categorical_features = [X_devices[model_data_and_results.features.iloc[0]].columns
                                .get_loc(col) for col in cat_feature_names]
    explainer = LimeTabularExplainer(training_data=X_devices[feature_names].values,
                                     feature_names=feature_names,
                                     class_names=fitted_model.classes_,
                                     categorical_features=categorical_features,
                                     mode='classification',
                                     discretize_continuous=True)
    file_name = f'needs_{tag}_{model_name}_{device_feature_name}_explainer.pkl'
    file_writer.write_explainer(explainer, cfg.MODELS_DIR, file_name)
def get_explainer(X_train, clf, feature_subset=None):
    """Build a LIME tabular explainer for the fitted classifier *clf*."""
    explainer = LimeTabularExplainer(training_data=X_train.values,
                                     feature_names=feature_subset,
                                     class_names=clf.classes_)
    return explainer
def explain_prediction(X_row, clf, lime_explainer, max_features=6, feature_subset=None, test_mode=False):
    """Explain one prediction with LIME and return the explanation as a DataFrame.

    In non-test mode the explainer and an HTML rendering of the explanation
    are also written to disk before returning.

    NOTE(review): `file_writer` and `cfg` are not imported in this module as
    shown here — the non-test branch would raise NameError; confirm imports.
    """
    if lime_explainer is None:
        raise ValueError("Please create explainer first")
    else:
        if feature_subset:
            explanation = lime_explainer.explain_instance(X_row[feature_subset].values, clf.predict_proba, num_features=max_features)
        else:
            explanation = lime_explainer.explain_instance(X_row, clf.predict_proba, num_features=max_features)
        if test_mode:
            return pd.DataFrame(explanation.as_list())
        else:
            file_writer.write_explainer(lime_explainer, cfg.MODELS_DIR, 'explainer')
            file_name = f'{clf.__class__.__name__}_explanation.html'
            file_writer.write_explanation(explanation, cfg.MODELS_DIR, file_name)
            return pd.DataFrame(explanation.as_list())
\ No newline at end of file
import pandas as pd
import numpy as np
import datetime as dt
from collections import namedtuple
from pathlib import Path
import config as cfg
from utility import data_dto
from tools import preprocessor
def make_complete_feature(df):
    """Flag each citizen Complete=1 if any of their rows has NumberCompleted > 0,
    then keep one representative row per citizen (first row for completers,
    last row for non-completers)."""
    per_citizen_max = df.groupby(['CitizenId'])['NumberCompleted'].transform('max')
    df['Complete'] = (per_citizen_max > 0).astype(int)
    completers = df.loc[df['Complete'] == 1].drop_duplicates(subset='CitizenId')
    non_completers = df.loc[df['Complete'] == 0].drop_duplicates(subset='CitizenId', keep='last')
    return pd.concat([completers, non_completers]).reset_index(drop=True)
def make_number_completed(df):
    """Compute the running number of completed courses per citizen.

    A course counts as completed once a citizen has accumulated at least
    10 weeks AND at least 7 training sessions; both accumulators then reset.
    The running count is written into df['NumberCompleted'] row by row.

    Fixes: uses Series.items() — Series.iteritems() was removed in
    pandas 2.0 — and iterates the groupby group directly instead of
    re-filtering the whole frame twice per citizen.
    """
    threshold_weeks = 10
    threshold_training = 7
    for _, group in df.groupby('CitizenId'):
        n_completed = 0
        cumsum_weeks = 0
        cumsum_training = 0
        for (idx, weeks), (_, trainings) in zip(group['NumberWeeks'].items(),
                                                group['NumberTraining'].items()):
            cumsum_weeks += weeks
            cumsum_training += trainings
            if cumsum_weeks >= threshold_weeks and cumsum_training >= threshold_training:
                cumsum_weeks = 0
                cumsum_training = 0
                n_completed += 1
            df.loc[idx, 'NumberCompleted'] = n_completed
    return df
def make_improve_feature_absolute(df, threshold):
    """Set Improve=1 when needs dropped by at least *threshold* points over the course."""
    improved = df['NeedsEnd'] <= df['NeedsStart'] - threshold
    df['Improve'] = improved.astype(int)
    return df
def make_improve_feature_relative(df):
    """Set Improve via the relative (>= 10%) improvement rule in get_improve_diff."""
    improve_flags = df.apply(get_improve_diff, axis=1)
    df['Improve'] = improve_flags
    return df
def make_citizen_training(df):
    """Add per-course (CitizenId x NumberCompleted) aggregate columns.

    The spec table lists (new column, source column, aggregation) in the
    exact insertion order of the original implementation, so the resulting
    column order is unchanged.
    """
    specs = [
        ('NumberWeeksSum', 'NumberWeeks', get_col_cumsum),
        ('NumberTrainingSum', 'NumberTraining', get_col_cumsum),
        ('NeedsStartBaseline', 'NeedsStart', get_col_first),
        ('MeanEvaluationMean', 'MeanEvaluation', get_col_mean),
        ('StdEvaluationMean', 'StdEvaluation', get_col_mean),
        ('NumberTrainingWeekMean', 'NumberTrainingWeek', get_col_mean),
        ('MeanTimeBetweenTrainingMean', 'MeanTimeBetweenTraining', get_col_mean),
        ('NumberCancelsSum', 'NumberCancels', get_col_cumsum),
        ('MeanTimeBetweenCancelsMean', 'MeanTimeBetweenCancels', get_col_mean),
        ('MeanNumberCancelsWeekMean', 'MeanNumberCancelsWeek', get_col_mean),
        ('NeedsDifferenceMean', 'NeedsDifference', get_col_mean),
        ('PhysicsDifferenceMean', 'PhysicsDifference', get_col_mean),
        ('NumberExercisesMean', 'NumberExercises', get_col_mean),
    ]
    for new_col, src_col, aggregate in specs:
        df[new_col] = aggregate(df, src_col)
    return df
def make_citizen_ats(df):
    """Add per-course week/training cumulative sums and the mean number of aids."""
    specs = [
        ('NumberWeeksSum', 'NumberWeeks', get_col_cumsum),
        ('NumberTrainingSum', 'NumberTraining', get_col_cumsum),
        ('NumberAtsMean', 'NumberAts', get_col_mean),
    ]
    for new_col, src_col, aggregate in specs:
        df[new_col] = aggregate(df, src_col)
    return df
def get_improve_diff(row):
    """Return 1 if needs improved by at least 10% relative to the baseline, else 0.

    Raises ValueError when the baseline is zero (relative change is undefined).
    """
    baseline = row['NeedsStartBaseline']
    if baseline == 0:
        raise ValueError('NeedsStartBaseline was zero, cannot compute improve')
    relative_gain = (baseline - row['NeedsEnd']) / baseline
    return 1 if relative_gain >= 0.1 else 0
def get_col_cumsum(df, col):
    """Running sum of *col* within each (CitizenId, NumberCompleted) course, rounded to 2 decimals."""
    return np.around(df.groupby(['CitizenId', 'NumberCompleted'])[col].transform(pd.Series.cumsum), decimals=2)


def get_col_mean(df, col):
    """Per-course mean of *col*, broadcast to every row, rounded to 2 decimals."""
    return np.around(df.groupby(['CitizenId', 'NumberCompleted'])[col].transform(pd.Series.mean), decimals=2)


def get_col_max(df, col):
    """Per-course maximum of *col*, broadcast to every row."""
    return df.groupby(['CitizenId', 'NumberCompleted'])[col].transform(pd.Series.max)


def get_col_first(df, col):
    """First value of *col* within each course, broadcast to every row."""
    return df.groupby(['CitizenId', 'NumberCompleted'])[col].transform('first')
def get_number_of_falls(df):
    """Map each CitizenId to its number of distinct fall dates."""
    unique_falls = df.drop_duplicates(["CitizenId", "Date"])
    return unique_falls.groupby(['CitizenId'])['Date'].count().to_dict()
def get_number_falls(fd):
    """Total number of distinct (CitizenId, Date) fall events; 0 for an empty frame."""
    if fd.empty:
        return 0
    return len(fd.drop_duplicates(["CitizenId", "Date"]))
def get_ats(ats, end_date):
    """Comma-joined 4-char ISO-class prefixes of aids lent on or before *end_date*, or NaN when none."""
    lent_before = ats.DevISOClass[ats.LendDate <= end_date]
    joined = ','.join(str(iso_class)[:4] for iso_class in lent_before)
    return joined if joined else np.nan
def get_number_ats(ats, end_date):
    """Count assistive aids lent on or before *end_date*.

    NOTE(review): `number_ats[0]` takes only the first group's count
    (positional lookup for a non-integer CitizenId index) — this assumes
    *ats* holds rows for a single citizen; verify against callers.
    """
    number_ats = ats.where(ats.LendDate <= end_date).groupby(['CitizenId'])['DevISOClass'].count()
    if number_ats.any():
        return number_ats[0]
    return 0
def get_number_exercises(ex):
    """Number of comma-separated exercise entries in the string *ex*."""
    return len(ex.split(','))
def get_start_year(pre_screening):
    """Calendar year of the pre-screening's ScreeningDate."""
    return pd.to_datetime(pre_screening.ScreeningDate).year


def get_birth_year(sc):
    """Birth year from the first screening row.

    NOTE(review): assumes two-digit birth years in the 1900s — confirm this
    against the source data (breaks for citizens born from 2000 onward).
    """
    return int(sc[cfg.BIRTH_YEAR].iloc[0]) + 1900


def get_gender(sc):
    """Gender encoded as 0 for 'FEMALE', 1 for anything else."""
    if (sc[cfg.GENDER].iloc[0] == 'FEMALE'):
        return 0
    return 1


def get_citizen_data(data, id):
    """Slice every table in the Data bundle down to the rows of one citizen."""
    sc = data.sc.loc[data.sc[cfg.CITIZEN_ID] == str(id)]
    ss = data.ss.loc[data.ss[cfg.CITIZEN_ID] == str(id)]
    td = data.td.loc[data.td[cfg.CITIZEN_ID] == str(id)]
    tc = data.tc.loc[data.tc[cfg.CITIZEN_ID] == str(id)]
    ats = data.ats.loc[data.ats[cfg.CITIZEN_ID] == str(id)]
    fd = data.fd.loc[data.fd[cfg.CITIZEN_ID] == str(id)]
    citizen_data = data_dto.Data(sc, ss, td, tc, ats, fd)
    return citizen_data
def get_window_data(td, tc, ss, ats, start_date, end_date):
    """Restrict the four tables to rows dated within [start_date, end_date]."""
    tdw = td.loc[(td[cfg.RATING_DATE] <= end_date)
                 & (td[cfg.RATING_DATE] >= start_date)]
    tcw = tc.loc[(tc[cfg.RATING_DATE] <= end_date)
                 & (tc[cfg.RATING_DATE] >= start_date)]
    ssw = ss.loc[(ss[cfg.CHANGE_DATE] <= end_date)
                 & (ss[cfg.CHANGE_DATE] >= start_date)]
    huw = ats.loc[(ats[cfg.LEND_DATE] <= end_date)
                  & (ats[cfg.LEND_DATE] >= start_date)]
    return tdw, tcw, ssw, huw


def convert_date_to_datetime(start_date, date_format):
    """Parse *start_date* using the given strftime-style format string."""
    return pd.to_datetime(start_date, format=date_format)


def get_cancels_week(tcw):
    """Label each cancellation with a 'week/year' bucket string.

    NOTE(review): Timestamp.week is deprecated in recent pandas in favour of
    isocalendar().week — confirm the pandas version the project pins.
    """
    cancels_week = tcw[cfg.RATING_DATE].apply(lambda x: "%d/%d" % (x.week, x.year))
    return cancels_week
def has_ats_id(ats, device_id):
    """True if any id fragment in *device_id* occurs in the ats string."""
    for fragment in device_id:
        if fragment in ats:
            return True
    return False
def get_cluster(cl):
    """Cluster id of the first row as an int; 0 when the frame is empty."""
    if cl.empty:
        return 0
    return int(cl.iloc[0].Cluster)
def get_interval_length(start_date, end_date):
    """Length of the interval in weeks, rounded to 2 decimals."""
    n_days = (end_date - start_date).days
    return np.around(n_days / 7, decimals=2)


def get_rehab_indicator(needs_start, physics_start):
    """Needs-to-physics ratio; the epsilon in the denominator avoids division by zero."""
    ratio = needs_start / (physics_start + 0.0001)
    return np.around(ratio, decimals=2)
def get_needs_reason(screening):
    """Free-text reason for need of help, or NaN for the literal 'nan' placeholder."""
    needs_reason = screening.NeedForHelpReason
    if not needs_reason == 'nan':
        return needs_reason
    else:
        return np.nan


def get_physics_reason(screening):
    """Free-text reason for reduced physical strength, or NaN for the 'nan' placeholder."""
    physics_reason = screening.PhysicalStrengthReason
    if not physics_reason == 'nan':
        return physics_reason
    else:
        return np.nan


def get_last_status(ssw):
    """Most recent status in the window with spaces stripped; NaN when absent."""
    if ssw.empty:
        return np.nan
    else:
        last_status = ssw[cfg.STATUS].iat[-1]
        if not pd.isnull(last_status):
            return last_status.replace(' ', '')
        else:
            return np.nan


def get_last_status_date(ssw, end_date, date_format):
    """Date of the most recent status change; falls back to *end_date* for an empty window."""
    if ssw.empty:
        return end_date
    else:
        return pd.to_datetime(ssw[cfg.CHANGE_DATE].iat[-1], format=date_format)
def get_exercises(ex):
    """Identity pass-through for the exercise string (kept for pipeline symmetry)."""
    return ex


def get_physics_indicator(physics_start, physics_end):
    """1 if physical strength improved over the course, else 0."""
    return 1 if physics_end > physics_start else 0


def get_n_cancel_week_min(cancelsprweek):
    """Fewest cancellations seen in any week bucket; 0 when there are none."""
    return cancelsprweek.value_counts().min() if not cancelsprweek.empty else 0


def get_n_weeks_without_training(n_weeks, n_weeks_with_trainings):
    """Number of whole weeks without any training, floored at 0."""
    return max(0, (np.ceil(n_weeks) - n_weeks_with_trainings))


def get_n_weeks_with_training(tdw, start_date):
    """Number of distinct week indices (relative to start_date) containing a training."""
    return tdw[cfg.RATING_DATE].apply(lambda x: np.floor((x - start_date).days / 7)).nunique()
def get_training_week(tdw, start_date):
    """Week index (0-based, floored) of each training relative to *start_date*.

    Returns an empty float Series when there are no trainings. Fix: the
    explicit dtype preserves the historical float64 default — a bare
    ``pd.Series([])`` is object-dtype in modern pandas and used to emit a
    DeprecationWarning about the changing default.
    """
    if not tdw.empty:
        return tdw[cfg.RATING_DATE].apply(lambda x: np.floor((x - start_date).days / 7))
    return pd.Series([], dtype=float)
def get_n_training_week_max(training_pr_week):
    """Highest number of trainings recorded in any single week; 0 when there are none."""
    if training_pr_week.empty:
        return 0
    return training_pr_week.value_counts().max()
def get_n_training_week_min(training_pr_week, n_weeks_with_trainings, n_weeks):
    """Fewest trainings in any week, but only under the guard condition below.

    NOTE(review): the guard `n_weeks_with_trainings > n_weeks` looks
    suspicious — weeks-with-training normally cannot exceed the total number
    of weeks, so this comparison may be inverted; confirm the intent.
    """
    if not training_pr_week.empty and n_weeks_with_trainings > n_weeks:
        return training_pr_week.value_counts().min()
    else:
        return 0
def get_n_training_week(n_weeks, n_training_window):
    """Average trainings per week, rounded to 1 decimal; 0 for missing or zero input."""
    if n_training_window is None or n_weeks is None:
        return 0
    if not n_weeks:
        return 0
    per_week = np.around(float(n_training_window) / n_weeks, decimals=1)
    return 0 if pd.isnull(per_week) else per_week
def get_n_training_optimal(n_weeks):
    """Optimal number of trainings: two per week, rounded up."""
    return np.ceil(2 * n_weeks)
def get_n_training_window(tdw):
    """Number of training rows in the window; 0 for an empty frame."""
    if tdw.empty:
        return 0
    return tdw.shape[0]
def get_max_evaluation(tdw):
    """Highest training rating in the window, rounded to 1 decimal; 0 when undefined (empty/NaN)."""
    max_evaluation = np.around(tdw[cfg.RATING_SCORE].max(axis=0), decimals=1)
    if not pd.isnull(max_evaluation):
        return max_evaluation
    else:
        return 0


def get_min_evaluation(tdw):
    """Lowest training rating in the window, rounded to 1 decimal; 0 when undefined."""
    min_evaluation = np.around(tdw[cfg.RATING_SCORE].min(axis=0), decimals=1)
    if not pd.isnull(min_evaluation):
        return min_evaluation
    else:
        return 0


def get_std_evaluation(tdw):
    """Standard deviation of training ratings, rounded to 1 decimal; 0 when undefined."""
    std_evaluation = np.around(tdw[cfg.RATING_SCORE].std(axis=0), decimals=1)
    if not pd.isnull(std_evaluation):
        return std_evaluation
    else:
        return 0


def get_mean_evaluation(tdw):
    """Mean training rating in the window, rounded to 1 decimal; 0 when undefined."""
    mean_evaluation = np.around(tdw[cfg.RATING_SCORE].mean(axis=0), decimals=1)
    if not pd.isnull(mean_evaluation):
        return mean_evaluation
    else:
        return 0


def get_time_between_training_mean(tdw, n_decimals=2):
    """Mean number of days between consecutive trainings; 0 when it cannot be computed.

    The first diff is NaN, hence the iloc[1:] slice before averaging.
    """
    time_between_trainings_dif = tdw[cfg.RATING_DATE].diff().apply(lambda x: x.days) if not tdw.empty else None
    if time_between_trainings_dif is not None:
        if not time_between_trainings_dif.empty:
            time_mean = np.round(time_between_trainings_dif.iloc[1:].mean(), n_decimals)
            if not pd.isnull(time_mean):
                return time_mean
            else:
                return 0
        else:
            return 0
    else:
        return 0


def get_mean_time_between_cancels(tcw, n_decimals=2):
    """Mean number of days between consecutive cancellations; 0 when fewer than 2 rows."""
    if not tcw.empty:
        time_between_cancels_diff = tcw[cfg.RATING_DATE].diff().apply(lambda x: x.days)
        if time_between_cancels_diff is not None:
            if tcw.shape[0] > 1:
                # Skip the leading NaN produced by diff().
                return np.round(time_between_cancels_diff.iloc[1:].mean(), n_decimals)
            else:
                return 0
        else:
            return 0
    return 0
def get_mean_cancels_week(n_cancel, n_weeks):
    """Average cancellations per week, rounded to 2 decimals; 0 for missing or zero weeks."""
    if n_cancel is None or n_weeks is None:
        return 0
    return round(float(n_cancel) / n_weeks, 2) if n_weeks else 0
\ No newline at end of file