Commit 51cd33c1 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

made config file and tested scaling

parent 1ea4f333
Pipeline #23224 failed with stage
in 3 minutes and 35 seconds
This diff is collapsed.
This diff is collapsed.
......@@ -18,8 +18,7 @@ import numpy as np
import pandas as pd
# Internal
from src.globals.constants import *
from src.globals.paths import *
from src.config import *
def filter_duplicated_devices(hu):
"""
......
......@@ -15,8 +15,7 @@ import numpy as np
import pandas as pd
from src.clustering.clean import *
from src.globals.constants import *
from src.globals.paths import *
from src.config import *
def create_clusters(hu, test_mode=False):
hu[DEV_ISO_CLASS] = hu[DEV_ISO_CLASS].apply(lambda ik: str(ik)[0:4])
......
# Central project configuration: raw-data file names and directory layout
# shared by the parsing/cleaning/feature modules.
import json
import os
from pathlib import Path
# Source data files the pipeline expects to find in the data directories
# (Danish municipal rehab exports; presumably placed under data/raw or
# data/interim — TODO confirm against the readers that consume them).
FILE_PATHS = ['DigiRehab_BorgerID_TrainingDone.xlsx',
'DigiRehab_BorgerID_TrainingCancelled.xlsx',
'DigiRehab_BorgerID_StatusSet.xlsx',
'DigiRehab_BorgerID_ScreeningContent.xlsx',
'HjaelpemidlerUdtraek.csv',
'isoall.txt',
'DigiRehab_borgerIDALL.csv',
'clusters.csv']
# Repository root: two .parent hops from this file's absolute path
# (assumes this module lives one directory below the root, e.g. src/config.py
# — TODO confirm).
ROOT_DIR = Path(__file__).absolute().parent.parent
# Directory layout derived from the project root.
MODELS_DIR = Path.joinpath(ROOT_DIR, 'models')
LOGS_DIR = Path.joinpath(ROOT_DIR, 'src/logs')
CONFIG_DIR = Path.joinpath(ROOT_DIR, 'src/cfg')
TESTS_FILES_DIR = Path.joinpath(ROOT_DIR, 'src/tests/files')
RAW_DATA_DIR = Path.joinpath(ROOT_DIR, 'data/raw')
PROCESSED_DATA_DIR = Path.joinpath(ROOT_DIR, 'data/processed')
INTERIM_DATA_DIR = Path.joinpath(ROOT_DIR, 'data/interim')
EXTERNAL_DATA_DIR = Path.joinpath(ROOT_DIR, 'data/external')
# Feature-column subsets used to select model inputs.
# General (demographic/screening) features.
GENERAL_SUBSET = ["Age",
'NumberATsRunning',
'Sex',
'RehabIndicator',
'NeedsStart',
'PhysicsStart']
# Assistive-device (AT) derived features.
DEVICE_SUBSET = ['HasRollator',
'HasRaisedToiletSeat',
'HasShowerStool',
'HasRaisedToiletSeatAndShowerStool',
'DevicesUnique',
'DevicesCount'] #TODO: Re add 'Cluster'
# Columns stored with pandas object dtype (presumably categorical/boolean-like
# values — TODO confirm against the feature builder).
OBJECT_COLS = ['LastStatus',
'HasRollator',
'HasRaisedToiletSeat',
'HasShowerStool',
'HasRaisedToiletSeatAndShowerStool']
# Columns whose cells hold list values.
LIST_COLS = ['DevicesUnique', 'DevicesCount']
# Supported modes for encoding device features.
DEVICE_OPTIONS = ['Unique', 'Count', 'Has', 'Empty'] # TODO: Re add 'cluster'
# Feature subset used when a citizen has at least two screenings
# (adds start/end differences and per-week training statistics).
TWO_SCREENINGS_SUBSET = ['Age',
'NumberATsRunning',
'Sex',
'RehabIndicator',
'NeedsStart',
'PhysicsStart',
'NeedsDifference',
'MeanEvaluation',
'StdEvaluation',
'MinEvaluation',
'MaxEvaluation',
'nTrainingPrWeek',
'nTrainingPrWeekMax',
'nTrainingPrWeekMin',
'TimeBetweenTrainingsAvg',
'nCancellationsPrWeekAVG',
'nCancellationsPrWeekMax',
'nCancellationsPrWeekMin',
'NeedsEnd',
'PhysicsEnd',
'PhysicsDifference']
# Canonical dataframe column names, referenced as cfg.* throughout the
# parsing and cleaning modules so the string literals live in one place.
# Identity / demographics columns.
PATIENT_ID = 'PatientId'
CITIZEN_ID = 'CitizenId'
BIRTH_YEAR = 'BirthYear'
SEX = 'Sex'
# Training-session rating columns.
RATING_DATE = 'RatingDate'
RATING_SCORE = 'RatingScore'
# Status-set columns.
CHANGE_DATE = 'ChangeDate'
STATUS = 'Status'
# Screening columns.
SCREENING_DATE = 'ScreeningDate'
NEED_FOR_HELP_SCORE = 'NeedForHelpScore'
NEED_FOR_HELP_REASON = 'NeedForHelpReason'
PHYSICAL_STRENGTH_SCORE = 'PhysicalStrengthScore'
PHYSICAL_STRENGTH_REASON = 'PhysicalStrengthReason'
EXERCISE_CONTENT = 'ExerciseContent'
# Assistive-device (hjaelpemiddel) loan columns.
DEV_HMI_NUMBER = 'DevHMINumber'
DEV_HMI_NAME = 'DevHMIName'
DEV_ISO_CLASS = 'DevISOClass'
DEV_SERIAL = 'DevSerial'
LAW_PARAGRAPH = 'LawParagraph'
LEND_DATE = 'LendDate'
RETURN_DATE = 'ReturnDate'
PRICE = 'Price'
# ISO-class lookup columns.
GROUP_SIZE = 'GroupSize'
DESCRIPTION = 'Description'
\ No newline at end of file
......@@ -7,47 +7,50 @@ Date: 20th March, 2020
"""
# Internal
from src.globals.constants import *
import src.utils.utility as ut
import src.log as logger
import src.config as cfg
# External
import pandas as pd
import numpy as np
logger = logger.setup_logger('cleaner')
def clean_patient_data(ptd):
    """Return the patient data with citizens lacking a valid ID removed."""
    return remove_citizens_without_valid_id(ptd)
def clean_screening_values(sv, ptd):
sv = remove_citizens_not_in_patient_data(sv, ptd, PATIENT_ID)
sv = remove_citizens_not_in_patient_data(sv, ptd, cfg.PATIENT_ID)
sv = remove_screenings_without_exercises(sv)
sv = merge_train_and_patient_data(sv, ptd, PATIENT_ID)
sv = sort_dataframe(sv, [CITIZEN_ID, SCREENING_DATE])
sv = merge_train_and_patient_data(sv, ptd, cfg.PATIENT_ID)
sv = sort_dataframe(sv, [cfg.CITIZEN_ID, cfg.SCREENING_DATE])
return sv
def clean_status_set(ss, ptd):
ss = remove_citizens_not_in_patient_data(ss, ptd, PATIENT_ID)
ss = merge_train_and_patient_data(ss, ptd, PATIENT_ID)
ss = sort_dataframe(ss, [CITIZEN_ID, CHANGE_DATE])
ss = remove_citizens_not_in_patient_data(ss, ptd, cfg.PATIENT_ID)
ss = merge_train_and_patient_data(ss, ptd, cfg.PATIENT_ID)
ss = sort_dataframe(ss, [cfg.CITIZEN_ID, cfg.CHANGE_DATE])
return ss
def clean_training_done(td, ptd):
td = remove_citizens_not_in_patient_data(td, ptd, PATIENT_ID)
td = merge_train_and_patient_data(td, ptd, PATIENT_ID)
td = sort_dataframe(td, [CITIZEN_ID, RATING_DATE])
td = remove_citizens_not_in_patient_data(td, ptd, cfg.PATIENT_ID)
td = merge_train_and_patient_data(td, ptd, cfg.PATIENT_ID)
td = sort_dataframe(td, [cfg.CITIZEN_ID, cfg.RATING_DATE])
return td
def clean_training_cancelled(tc, ptd):
tc = remove_citizens_not_in_patient_data(tc, ptd, PATIENT_ID)
tc = merge_train_and_patient_data(tc, ptd, PATIENT_ID)
tc = sort_dataframe(tc, [CITIZEN_ID, RATING_DATE])
tc = remove_citizens_not_in_patient_data(tc, ptd, cfg.PATIENT_ID)
tc = merge_train_and_patient_data(tc, ptd, cfg.PATIENT_ID)
tc = sort_dataframe(tc, [cfg.CITIZEN_ID, cfg.RATING_DATE])
return tc
def clean_assistive_aids(ats, ids, iso_classes):
ats = sort_dataframe(ats, [CITIZEN_ID, LEND_DATE])
ats = sort_dataframe(ats, [cfg.CITIZEN_ID, cfg.LEND_DATE])
ats = filter_ats_on_ids(ats, ids)
ats = remove_rows_with_old_dates(ats, LEND_DATE)
ats = remove_rows_with_old_dates(ats, RETURN_DATE)
ats = remove_rows_with_old_dates(ats, cfg.LEND_DATE)
ats = remove_rows_with_old_dates(ats, cfg.RETURN_DATE)
ats = remove_rows_with_invalid_id(ats)
ats = remove_deprecated_device_data(ats)
ats = remove_tainted_histories(ats)
......@@ -58,7 +61,7 @@ def drop_invalid_devices(ats, iso_classes):
return ats[ats['DevISOClass'].isin(iso_classes.DevISOClass)]
def remove_screenings_without_exercises(screenings):
nan_exercises = pd.isna(screenings[EXERCISE_CONTENT])
nan_exercises = pd.isna(screenings[cfg.EXERCISE_CONTENT])
screenings = screenings[~nan_exercises]
return screenings
......@@ -67,8 +70,8 @@ def remove_citizens_not_in_patient_data(train_data, patient_data, id):
return data
def remove_citizens_without_valid_id(patient_data):
patients_to_remove = patient_data[patient_data[CITIZEN_ID] == '0'][PATIENT_ID].unique()
patient_data = patient_data[~patient_data[PATIENT_ID].isin(patients_to_remove)]
patients_to_remove = patient_data[patient_data[cfg.CITIZEN_ID] == '0'][cfg.PATIENT_ID].unique()
patient_data = patient_data[~patient_data[cfg.PATIENT_ID].isin(patients_to_remove)]
return patient_data
def merge_train_and_patient_data(train_data, patient_data, key):
......@@ -79,18 +82,18 @@ def sort_dataframe(data, by):
def filter_ats_on_ids(ats, ids):
return ats[ats[CITIZEN_ID].isin(ids)]
return ats[ats[cfg.CITIZEN_ID].isin(ids)]
def remove_tainted_histories(ats):
tained_ids = ats[ats[DEV_HMI_NUMBER] == '899,999'][CITIZEN_ID].unique()
ats = ats[np.logical_not(ats[CITIZEN_ID].isin(tained_ids))]
tained_ids = ats[ats[cfg.DEV_HMI_NUMBER] == '899,999'][cfg.CITIZEN_ID].unique()
ats = ats[np.logical_not(ats[cfg.CITIZEN_ID].isin(tained_ids))]
return ats
def remove_deprecated_device_data(ats):
return ats[ats[DEV_HMI_NUMBER] != '899,999']
return ats[ats[cfg.DEV_HMI_NUMBER] != '899,999']
def remove_rows_with_invalid_id(ats):
return ats[ats[CITIZEN_ID] != "0000000000"]
return ats[ats[cfg.CITIZEN_ID] != "0000000000"]
def remove_rows_with_old_dates(ats, col):
ats[col] = pd.to_datetime(ats[col])
......
......@@ -7,15 +7,15 @@ Date: 20th March, 2020
"""
# Internal
from src.globals.constants import *
from src.globals.paths import *
import src.data.file_reader as file_reader
import src.data.file_writer as file_writer
import src.config as cfg
# External
import pandas as pd
import numpy as np
import datetime
from pathlib import Path
def parse_training_table(file_name, path, parsed_file_name):
"""
......@@ -26,14 +26,14 @@ def parse_training_table(file_name, path, parsed_file_name):
"""
df = file_reader.read_excelfile(path, file_name)
training_columns = [PATIENT_ID, RATING_DATE, RATING_SCORE]
training_columns = [cfg.PATIENT_ID, cfg.RATING_DATE, cfg.RATING_SCORE]
training_df = format_single_line_data(df, training_columns, "Patient Id")
# Format data set
training_df.replace(to_replace='', value=np.nan, regex=True, inplace=True)
training_df[PATIENT_ID] = pd.Series.astype(training_df[PATIENT_ID], dtype=str)
training_df[RATING_DATE] = pd.to_datetime(training_df[RATING_DATE], format='%d/%m/%Y')
training_df[RATING_SCORE] = pd.Series.astype(training_df[RATING_SCORE], dtype='float')
training_df[cfg.PATIENT_ID] = pd.Series.astype(training_df[cfg.PATIENT_ID], dtype=str)
training_df[cfg.RATING_DATE] = pd.to_datetime(training_df[cfg.RATING_DATE], format='%d/%m/%Y')
training_df[cfg.RATING_SCORE] = pd.Series.astype(training_df[cfg.RATING_SCORE], dtype='float')
#TODO: Enable saving
#file_name = str(parsed_file_name + ".csv")
......@@ -50,15 +50,15 @@ def parse_status_set(file_name, path, parsed_file_name):
"""
df = file_reader.read_excelfile(path, file_name)
ss_columns = [PATIENT_ID, CHANGE_DATE, STATUS]
ss_columns = [cfg.PATIENT_ID, cfg.CHANGE_DATE, cfg.STATUS]
status_set_df = format_single_line_data(df, ss_columns, "Patient Id")
# Format data set
status_set_df.replace(to_replace='', value=np.nan, regex=True, inplace=True)
status_set_df[PATIENT_ID] = pd.Series.astype(status_set_df[PATIENT_ID], dtype=str)
status_set_df[CHANGE_DATE] = pd.to_datetime(status_set_df[CHANGE_DATE], format='%d/%m/%Y')
status_set_df[STATUS] = status_set_df[STATUS].astype('category')
status_set_df[cfg.PATIENT_ID] = pd.Series.astype(status_set_df[cfg.PATIENT_ID], dtype=str)
status_set_df[cfg.CHANGE_DATE] = pd.to_datetime(status_set_df[cfg.CHANGE_DATE], format='%d/%m/%Y')
status_set_df[cfg.STATUS] = status_set_df[cfg.STATUS].astype('category')
#TODO: Enable saving
#file_name = str(parsed_file_name + ".csv")
......@@ -76,26 +76,26 @@ def parse_screening_values(file_name, path, parsed_file_name):
"""
df = file_reader.read_excelfile(path, file_name)
sv_columns = [PATIENT_ID,
SCREENING_DATE,
NEED_FOR_HELP_SCORE,
NEED_FOR_HELP_REASON,
PHYSICAL_STRENGTH_SCORE,
PHYSICAL_STRENGTH_REASON,
EXERCISE_CONTENT]
sv_columns = [cfg.PATIENT_ID,
cfg.SCREENING_DATE,
cfg.NEED_FOR_HELP_SCORE,
cfg.NEED_FOR_HELP_REASON,
cfg.PHYSICAL_STRENGTH_SCORE,
cfg.PHYSICAL_STRENGTH_REASON,
cfg.EXERCISE_CONTENT]
# iterate through and parse the data
sv = format_multiple_line_data(df, sv_columns, "Patient Id")
# formatting dataset
sv.replace(to_replace='', value=np.nan, regex=True, inplace=True)
sv[PATIENT_ID] = pd.Series.astype(sv[PATIENT_ID], dtype=str)
sv[SCREENING_DATE] = pd.to_datetime(sv[SCREENING_DATE], format='%d-%m-%Y')
sv[NEED_FOR_HELP_SCORE] = pd.Series.astype(sv[NEED_FOR_HELP_SCORE], dtype='float')
sv[NEED_FOR_HELP_REASON] = sv[NEED_FOR_HELP_REASON].astype('category')
sv[PHYSICAL_STRENGTH_SCORE] = pd.Series.astype(sv[PHYSICAL_STRENGTH_SCORE], dtype='float')
sv[PHYSICAL_STRENGTH_REASON] = sv[PHYSICAL_STRENGTH_REASON].astype('category')
sv[EXERCISE_CONTENT] = sv[EXERCISE_CONTENT].astype('category')
sv[cfg.PATIENT_ID] = pd.Series.astype(sv[cfg.PATIENT_ID], dtype=str)
sv[cfg.SCREENING_DATE] = pd.to_datetime(sv[cfg.SCREENING_DATE], format='%d-%m-%Y')
sv[cfg.NEED_FOR_HELP_SCORE] = pd.Series.astype(sv[cfg.NEED_FOR_HELP_SCORE], dtype='float')
sv[cfg.NEED_FOR_HELP_REASON] = sv[cfg.NEED_FOR_HELP_REASON].astype('category')
sv[cfg.PHYSICAL_STRENGTH_SCORE] = pd.Series.astype(sv[cfg.PHYSICAL_STRENGTH_SCORE], dtype='float')
sv[cfg.PHYSICAL_STRENGTH_REASON] = sv[cfg.PHYSICAL_STRENGTH_REASON].astype('category')
sv[cfg.EXERCISE_CONTENT] = sv[cfg.EXERCISE_CONTENT].astype('category')
#TODO: Enable saving
#file_name = str(parsed_file_name + ".csv")
......@@ -112,15 +112,15 @@ def parse_assistive_aids(file_name, path, parsed_file_name):
"""
hu_file = Path.joinpath(path, file_name)
hu_columns = [CITIZEN_ID,
DEV_HMI_NUMBER,
DEV_HMI_NAME,
DEV_ISO_CLASS,
DEV_SERIAL,
LAW_PARAGRAPH,
LEND_DATE,
RETURN_DATE,
PRICE]
hu_columns = [cfg.CITIZEN_ID,
cfg.DEV_HMI_NUMBER,
cfg.DEV_HMI_NAME,
cfg.DEV_ISO_CLASS,
cfg.DEV_SERIAL,
cfg.LAW_PARAGRAPH,
cfg.LEND_DATE,
cfg.RETURN_DATE,
cfg.PRICE]
date_parser = lambda c: pd.to_datetime(c, format='%Y/%m/%d %H:%M:%S', errors='coerce')
......@@ -149,7 +149,7 @@ def parse_iso_classes(file_name, path, parsed_file_name):
df = pd.read_csv(isoclass_file,
header=None,
usecols=[0, 1, 2],
names=[DEV_ISO_CLASS, GROUP_SIZE, DESCRIPTION],
names=[cfg.DEV_ISO_CLASS, cfg.GROUP_SIZE, cfg.DESCRIPTION],
converters={i: str for i in range(0, 10000)})
#TODO: Enable saving
......@@ -168,10 +168,10 @@ def parse_patient_data(file_name, path, parsed_file_name):
"""
citizen_file = Path.joinpath(path, file_name)
patient_data_columns = [PATIENT_ID,
CITIZEN_ID,
SEX,
BIRTH_YEAR]
patient_data_columns = [cfg.PATIENT_ID,
cfg.CITIZEN_ID,
cfg.SEX,
cfg.BIRTH_YEAR]
df = pd.read_csv(citizen_file,
header=0,
......@@ -180,10 +180,10 @@ def parse_patient_data(file_name, path, parsed_file_name):
df.dropna(subset=['Gender'], inplace=True) # remove rows with incomplete data
df.columns = patient_data_columns
df[PATIENT_ID] = pd.Series.astype(df[PATIENT_ID], dtype=str)
df[CITIZEN_ID] = pd.Series.astype(df[CITIZEN_ID], dtype=str)
df[SEX] = df[SEX].astype('category')
df[BIRTH_YEAR] = df[BIRTH_YEAR].astype('float')
df[cfg.PATIENT_ID] = pd.Series.astype(df[cfg.PATIENT_ID], dtype=str)
df[cfg.CITIZEN_ID] = pd.Series.astype(df[cfg.CITIZEN_ID], dtype=str)
df[cfg.SEX] = df[cfg.SEX].astype('category')
df[cfg.BIRTH_YEAR] = df[cfg.BIRTH_YEAR].astype('float')
#TODO: Enable saving
#file_name = str(parsed_file_name + ".csv")
......@@ -192,9 +192,9 @@ def parse_patient_data(file_name, path, parsed_file_name):
return df
def parse_clusters(file_name):
clusters = pd.read_csv(Path.joinpath(INTERIM_DATA_DIR, file_name),
names=[CITIZEN_ID, 'Cluster'],
dtype = {CITIZEN_ID: str})
clusters = pd.read_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, file_name),
names=[cfg.CITIZEN_ID, 'Cluster'],
dtype = {cfg.CITIZEN_ID: str})
return clusters
def format_single_line_data(data, data_columns, index_col, delimiter=';'):
......
......@@ -8,8 +8,7 @@ Date: 20th March, 2020
# Imports
# Internal
from src.globals.constants import *
from src.globals.paths import *
import src.config as cfg
import src.utils.utility as ut
# External
......@@ -49,7 +48,7 @@ single_sample=True, only_first_and_last_sv=False,
# Save the processed files and features to disk
if not test_mode:
features.to_csv(Path.joinpath(PROCESSED_DATA_DIR, "AIRfeatures.csv"),
features.to_csv(Path.joinpath(cfg.PROCESSED_DATA_DIR, "AIRfeatures.csv"),
sep=";", index=False, date_format="%d-%m-%Y", na_rep='NA')
## Fejler her:
......@@ -68,12 +67,12 @@ def create_window_features(id, data, n_citizens_less_two_sv, only_first_and_last
id_features = pd.DataFrame()
# Make copy of this specific person's values from data tuple
ptd_id = data.ptd.loc[data.ptd[CITIZEN_ID] == id]
sv_id = data.sv.loc[data.sv[CITIZEN_ID] == id]
ss_id = data.ss.loc[data.ss[CITIZEN_ID] == id]
td_id = data.td.loc[data.td[CITIZEN_ID] == id]
tc_id = data.tc.loc[data.tc[CITIZEN_ID] == id]
ats_id = data.ats.loc[data.ats[CITIZEN_ID] == str(id)]
ptd_id = data.ptd.loc[data.ptd[cfg.CITIZEN_ID] == id]
sv_id = data.sv.loc[data.sv[cfg.CITIZEN_ID] == id]
ss_id = data.ss.loc[data.ss[cfg.CITIZEN_ID] == id]
td_id = data.td.loc[data.td[cfg.CITIZEN_ID] == id]
tc_id = data.tc.loc[data.tc[cfg.CITIZEN_ID] == id]
ats_id = data.ats.loc[data.ats[cfg.CITIZEN_ID] == str(id)]
if sv_id.empty or len(sv_id) < 2: # If there is no screening record just jump to next BorgerID.
n_citizens_less_two_sv = n_citizens_less_two_sv + 1
......@@ -88,7 +87,7 @@ def create_window_features(id, data, n_citizens_less_two_sv, only_first_and_last
# Looking at the screenings and take the windows prior to this
for i, screening in enumerate(sv_id.itertuples(index=False)):
# skipping the loop if the first screening as this wil be the ini
# Skipping the loop if the first screening as this will be the ini
if i == 0:
continue
......@@ -117,24 +116,25 @@ def create_window_features(id, data, n_citizens_less_two_sv, only_first_and_last
continue
# create window and assign features that are not time dependent
patient_id = sv_id[PATIENT_ID].iloc[0]
sex = 0 if (sv_id[SEX].iloc[0] == 'Kvinde') else 1
birth_year = int(sv_id[BIRTH_YEAR].iloc[0]) + 1900
window_features = pd.DataFrame([{CITIZEN_ID: id, PATIENT_ID: patient_id, SEX: sex, BIRTH_YEAR: birth_year}])
patient_id = sv_id[cfg.PATIENT_ID].iloc[0]
sex = 0 if (sv_id[cfg.SEX].iloc[0] == 'Kvinde') else 1
birth_year = int(sv_id[cfg.BIRTH_YEAR].iloc[0]) + 1900
window_features = pd.DataFrame([{cfg.CITIZEN_ID: id, cfg.PATIENT_ID: patient_id, cfg.SEX: sex, cfg.BIRTH_YEAR: birth_year}])
window_features['Age'] = get_age_at_start(pre_screening, birth_year)
# Data for this particular window
tdw = td_id.loc[(td_id[RATING_DATE] <= end_date) & (td_id[RATING_DATE] >= start_date)]
tcw = tc_id.loc[(tc_id[RATING_DATE] <= end_date) & (tc_id[RATING_DATE] >= start_date)]
ssw = ss_id.loc[(ss_id[CHANGE_DATE] <= end_date) & (ss_id[CHANGE_DATE] >= start_date)]
huw = ats_id.loc[(ats_id[LEND_DATE] <= end_date) & (ats_id[LEND_DATE] >= start_date)]
tdw = td_id.loc[(td_id[cfg.RATING_DATE] <= end_date) & (td_id[cfg.RATING_DATE] >= start_date)]
tcw = tc_id.loc[(tc_id[cfg.RATING_DATE] <= end_date) & (tc_id[cfg.RATING_DATE] >= start_date)]
ssw = ss_id.loc[(ss_id[cfg.CHANGE_DATE] <= end_date) & (ss_id[cfg.CHANGE_DATE] >= start_date)]
huw = ats_id.loc[(ats_id[cfg.LEND_DATE] <= end_date) & (ats_id[cfg.LEND_DATE] >= start_date)]
# Set rating date as start date for window if it exists
start_date = tdw.RatingDate.iloc[0] if not tdw.empty else start_date
# Calculate and assign evaluations
window_features['StartDate'] = dt.datetime.strptime(str(start_date), '%Y-%m-%d %H:%M:%S').date()
window_features['EndDate'] = dt.datetime.strptime(str(end_date), '%Y-%m-%d %H:%M:%S').date()
#StartDate, EndDate
window_features['StartMonth'] = dt.datetime.strptime(str(start_date), '%Y-%m-%d %H:%M:%S').date().month
window_features['EndMonth'] = dt.datetime.strptime(str(end_date), '%Y-%m-%d %H:%M:%S').date().month
window_features['nWeeks'] = n_weeks
window_features['MeanEvaluation'] = get_mean_evaluation(tdw)
window_features['StdEvaluation'] = get_std_evaluation(tdw)
......@@ -162,7 +162,7 @@ def create_window_features(id, data, n_citizens_less_two_sv, only_first_and_last
# Calculate and assign training cancellations
n_cancel = tcw.shape[0]
cancelsprweek = tcw[RATING_DATE].apply(lambda x: "%d/%d" % (x.week, x.year))
cancelsprweek = tcw[cfg.RATING_DATE].apply(lambda x: "%d/%d" % (x.week, x.year))
window_features['nWeeksWithoutTrainings'] = get_n_weeks_without_training(n_weeks, n_weeks_with_training)
window_features['nCancellations'] = n_cancel
window_features['TimeBetweenCancelsAvg'] = get_avg_time_between_cancels(tcw, n_cancel)
......@@ -199,26 +199,26 @@ def create_window_features(id, data, n_citizens_less_two_sv, only_first_and_last
# Calculate and assign needs
window_features['NeedsStart'] = pre_screening.NeedForHelpScore
window_features['NeedsStartReason'] = get_needs_reason(pre_screening)
#window_features['NeedsStartReason'] = get_needs_reason(pre_screening) TODO: Decide to include
window_features['NeedsEnd'] = screening.NeedForHelpScore
window_features['NeedsDifference'] = screening.NeedForHelpScore - pre_screening.NeedForHelpScore
window_features['NeedsReason'] = get_needs_reason(screening)
#window_features['NeedsReason'] = get_needs_reason(screening) TODO: Decide to include
window_features['Needs'] = get_needs_indicator(pre_screening.NeedForHelpScore, screening.NeedForHelpScore)
# Calculate and assign physics
window_features['PhysicsStart'] = pre_screening.PhysicalStrengthScore
window_features['PhysicsStartReason'] = get_physics_reason(pre_screening)
#window_features['PhysicsStartReason'] = get_physics_reason(pre_screening) TODO: Decide to include
window_features['PhysicsEnd']= screening.PhysicalStrengthScore
window_features['PhysicsDifference'] = screening.PhysicalStrengthScore - pre_screening.PhysicalStrengthScore
window_features['PhysicsReason'] = get_physics_reason(screening)
#window_features['PhysicsReason'] = get_physics_reason(screening) TODO: Decide to include
window_features['Physics'] = get_physics_indicator(
pre_screening.PhysicalStrengthScore, screening.PhysicalStrengthScore)
# Calculate and assign misc
window_features['RehabIndicator'] = get_rehab_indicator(pre_screening.NeedForHelpScore,
pre_screening.PhysicalStrengthScore)
window_features['RehabIndicator'] = get_rehab_indicator(pre_screening.NeedForHelpScore,
pre_screening.PhysicalStrengthScore)
window_features['Exercises'] = pd.Series([get_exercises(pre_screening)])
window_features['LastStatusDate'] = get_last_status_date(ssw, '%Y-%m-%d %H:%M:%S')
window_features['LastStatusMonth'] = get_last_status_month(ssw, '%Y-%m-%d %H:%M:%S')
window_features['LastStatus'] = get_last_status(ssw)
# Add window to total data for id
......@@ -236,17 +236,17 @@ def get_rehab_indicator(needs_start, physics_start):
return needs_start / physics_start
def get_devices_count(hu_id, start_date):
return hu_id[DEV_ISO_CLASS][(hu_id.LendDate <= start_date)
return hu_id[cfg.DEV_ISO_CLASS][(hu_id.LendDate <= start_date)
& ((start_date < hu_id.ReturnDate) | (hu_id.ReturnDate.isna()))]
def get_n_weeks_with_training_first_12(tdw, start_date):
return tdw[RATING_DATE].apply(
return tdw[cfg.RATING_DATE].apply(
lambda x: (np.floor((x - start_date).days / 7))
if (x - start_date).days <= 84
else np.nan).dropna().nunique()
def get_n_weeks_with_training_last_12(tdw, start_date, end_date):
return tdw[RATING_DATE].apply(
return tdw[cfg.RATING_DATE].apply(
lambda x: (np.floor((x - start_date).days / 7))
if (end_date - x).days <= 84
else np.nan).dropna().nunique()
......@@ -255,9 +255,9 @@ def get_successful_program_all(td_id):
successful_program_all = 0
if not td_id.empty:
first_training = td_id.RatingDate.iloc[0]
n_weeks_with_trainings_all = td_id[RATING_DATE].apply(lambda x: np.floor((x - first_training).days / 7)).nunique()
n_weeks_with_trainings_all = td_id[cfg.RATING_DATE].apply(lambda x: np.floor((x - first_training).days / 7)).nunique()
if n_weeks_with_trainings_all >= 8:
weeks_of_training = td_id[RATING_DATE].apply(lambda x: np.floor((x - first_training).days / 7)).unique()
weeks_of_training = td_id[cfg.RATING_DATE].apply(lambda x: np.floor((x - first_training).days / 7)).unique()
for i, week in enumerate(weeks_of_training):
weeks_left = len(weeks_of_training) - i
if weeks_left < 8:
......@@ -283,28 +283,28 @@ def get_physics_reason(screening):
def get_last_status(ssw):
if ssw.empty:
return np.nan
return 'None'
else:
return ssw[STATUS].iat[-1]
return ssw[cfg.STATUS].iat[-1]
def get_last_status_date(ssw, date_format):
def get_last_status_month(ssw, date_format):
if ssw.empty:
return np.nan
return 0
else:
date_time_obj = dt.datetime.strptime(str(ssw[CHANGE_DATE].iat[-1]), date_format)
return date_time_obj.date()
date_time_obj = dt.datetime.strptime(str(ssw[cfg.CHANGE_DATE].iat[-1]), date_format)
return date_time_obj.date().month
def get_new_at(huw):
if huw.empty:
return np.nan
else:
return [at for at in huw[DEV_ISO_CLASS]]
return [at for at in huw[cfg.DEV_ISO_CLASS]]
def get_number_of_ats_running(hui, end_date):
return hui.loc[(hui[LEND_DATE] <= end_date)].shape[0]
return hui.