"""
raw_loader.py
====================================
Module to load raw data for cases.
"""
from tools import file_reader
from utility.time import assert_datetime
import pandas as pd
import numpy as np
from pathlib import Path
import abc
from io import BytesIO, StringIO
import shutil
class BaseRawLoader2021(metaclass=abc.ABCMeta):
"""
Base class for raw loaders 2021.
"""
    @abc.abstractmethod
def load_status_set(self, aalborg_file_name,
viborg_file_name, file_path):
"""Load the DiGiRehab StatusSet data set."""
    @abc.abstractmethod
def load_training_cancelled(self, aalborg_file_name,
viborg_file_name, file_path):
"""Load the DiGiRehab Training Cancelled data set."""
    @abc.abstractmethod
def load_screening_content(self, aalborg_file_name,
viborg_file_name, file_path):
"""Load the DiGiRehab Screening Content data set."""
    @abc.abstractmethod
def load_training_done(self, aalborg_file_name,
viborg_file_name, file_path):
"""Load the DiGiRehab Training Done data set."""
    @abc.abstractmethod
def load_assistive_aids(self, aalborg_file_name, viborg_file_name,
viborg_alarm_file_name, file_path):
"""Load the DiGiRehab Assistive Aids data set."""
    @abc.abstractmethod
def load_iso_classes(self, file_name, file_path):
"""Load the ISO classes data set"""
class RawLoader2021(BaseRawLoader2021):
    """Raw loader for the 2021 data set."""
    def load_assistive_aids(self, aalborg_file_name: str, viborg_file_name: str,
                            viborg_alarm_file_name: str, file_path: Path) -> pd.DataFrame:
        """
        Load the DiGiRehab Assistive Aids data set.
        :param aalborg_file_name: name of the Aalborg file
        :param viborg_file_name: name of the Viborg file
        :param viborg_alarm_file_name: name of the Viborg alarm file
        :param file_path: path of the files
        :return: dataframe with the loaded data
        """
# Load Aalborg data
converters = {'ID': str,
'Kategori ISO nummer': str}
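        # Copy the Aalborg workbook into an in-memory buffer before handing it to the Excel reader.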
infile = BytesIO()
with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_aa = file_reader.read_excelfile(infile, converters=converters)
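        # Aalborg records every lend/return event as its own row. Number the events per
        # (citizen, ISO class): even sequence numbers become lend dates, odd ones return
        # dates, and shifting the return date up one row pairs it with its preceding lend.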
df_aa['Seq'] = df_aa.groupby(['ID', 'Kategori ISO nummer']).cumcount()
df_aa = df_aa[['ID', 'Birth Year', 'Gender', 'Kategori ISO nummer', 'Kørselsdato', 'Seq']]
df_aa['LendDate'] = df_aa.apply(lambda x: x['Kørselsdato']
if x['Seq'] % 2 == 0 else pd.NaT, axis=1)
df_aa['ReturnDate'] = df_aa.apply(lambda x: x['Kørselsdato']
if x['Seq'] % 2 == 1 else pd.NaT, axis=1)
df_aa['ReturnDate'] = df_aa.groupby(['ID', 'Kategori ISO nummer'])['ReturnDate'].shift(-1)
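        # Keep only rows that have at least one of the two dates, then drop the helper columns.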
df_aa = df_aa.dropna(subset=['LendDate', 'ReturnDate'], thresh=1)
df_aa = df_aa.drop(['Kørselsdato', 'Seq'], axis=1)
df_aa = df_aa.rename(columns={'ID': 'CitizenId',
'Birth Year': 'BirthYear',
'Kategori ISO nummer': 'DevISOClass'})
df_aa['LendDate'] = pd.to_datetime(df_aa['LendDate'])
        df_aa['ReturnDate'] = pd.to_datetime(df_aa['ReturnDate'], errors='coerce')
# Load Viborg data
converters = {'BorgerID': str, 'Kategori ISO nummer': str}
infile = BytesIO()
with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_vb = file_reader.read_excelfile(infile, converters=converters)
df_vb = df_vb[['BorgerID', 'Birth Year', 'Gender',
'Kategori ISO nummer', 'Leveret Dato', 'Returneret Dato']]
df_vb = df_vb.rename(columns={'BorgerID': 'CitizenId',
'Birth Year': 'BirthYear',
'Kategori ISO nummer': 'DevISOClass',
'Leveret Dato': 'LendDate',
'Returneret Dato': 'ReturnDate'})
df_vb['LendDate'] = pd.to_datetime(df_vb['LendDate'])
df_vb['ReturnDate'] = pd.to_datetime(df_vb['ReturnDate'], errors='coerce')
# Load Viborg alarm file
converters = {'CitizenId': str}
infile = BytesIO()
with open(Path.joinpath(file_path, viborg_alarm_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_vb_alarm = file_reader.read_excelfile(infile, converters=converters)
df_vb_alarm = df_vb_alarm[['CitizenId', 'BirthYear', 'Gender',
'Ikrafttrædelsesdato', 'Visitation slut']]
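        # The alarm export has no ISO class column, so tag every alarm record with the same device class code.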
df_vb_alarm['DevISOClass'] = '22271812'
df_vb_alarm = df_vb_alarm.rename(columns={'Ikrafttrædelsesdato': 'LendDate',
'Visitation slut': 'ReturnDate'})
df_vb_alarm['LendDate'] = pd.to_datetime(df_vb_alarm['LendDate'])
df_vb_alarm['ReturnDate'] = pd.to_datetime(df_vb_alarm['ReturnDate'], errors='coerce')
# Merge Viborg data
df_vb = pd.concat([df_vb, df_vb_alarm])
# Merge Aalborg and Viborg data
df = pd.concat([df_aa, df_vb])
return df
    def load_iso_classes(self, file_name: str, file_path: Path) -> pd.DataFrame:
        """
        Load the ISO classes data set.
        :param file_name: name of the file
        :param file_path: path of the file
        :return: dataframe with the loaded data
        """
isoclass_file = Path.joinpath(file_path, file_name)
col_names = ['DevISOClass', 'GroupSize', 'Description']
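        # Read every column as a string so the class codes are not converted to numbers.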
converters = {i: str for i in range(0, 10000)}
infile = StringIO()
with open(isoclass_file, 'r', encoding='utf8') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df = file_reader.read_csv(infile, header=None, usecols=[0, 1, 2],
names=col_names, converters=converters)
return df
    def load_status_set(self, aalborg_file_name: str,
                        viborg_file_name: str, file_path: Path) -> pd.DataFrame:
        """
        Load the DiGiRehab StatusSet data set.
        :param aalborg_file_name: name of the Aalborg file
        :param viborg_file_name: name of the Viborg file
        :param file_path: path of the files
        :return: dataframe with the loaded data
        """
index_col = 'Id'
converters = {index_col: str} # pass as string to avoid float conversion
infile = BytesIO()
with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_aa_ss = file_reader.read_excelfile(infile, converters=converters)
infile = BytesIO()
with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_vb_ss = file_reader.read_excelfile(infile, converters=converters)
# Merge Aalborg and Viborg data
df_ss = pd.concat([df_aa_ss, df_vb_ss])
data_columns = ['CitizenId', 'ChangeDate', 'Status', 'BirthYear', 'Gender']
df = pd.DataFrame(columns=data_columns)
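        # Each source row packs its status history into a single cell: entries are separated
        # by '#' and fields within an entry by ';'; bracket characters are stripped from the last field.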
for _, row in df_ss.iterrows():
if not pd.isna(row[6]):
                list_of_entries = [line.split(';') for line in row[6].split('#')]
for entry in list(filter(None, list_of_entries)):
entry.insert(0, row[index_col])
entry[-1] = entry[-1].replace('[', '').replace(']', '')
if not assert_datetime(entry[1]): # Handle a case where time is missing
entry[1] = np.nan
entry[2] = 'AutoTerminated'
entry_data = pd.DataFrame([entry[:3] + [row[1]] + [row[2]]],
columns=data_columns)
                    df = pd.concat([df, entry_data], ignore_index=True)
# Handle missing values
df['BirthYear'] = df['BirthYear'].fillna(0)
df['ChangeDate'] = pd.to_datetime(df['ChangeDate'])
        df['Status'] = df['Status'].astype(str)
        df['Gender'] = df['Gender'].astype(str)
        df['BirthYear'] = df['BirthYear'].astype(int)
return df
    def load_training_cancelled(self, aalborg_file_name: str,
                                viborg_file_name: str, file_path: Path) -> pd.DataFrame:
        """
        Load the DiGiRehab Training Cancelled data set.
        :param aalborg_file_name: name of the Aalborg file
        :param viborg_file_name: name of the Viborg file
        :param file_path: path of the files
        :return: dataframe with the loaded data
        """
index_col = 'Id'
converters = {index_col: str} # pass as string to avoid float conversion
infile = BytesIO()
with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_aa_tc = file_reader.read_excelfile(infile, converters=converters)
infile = BytesIO()
with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_vb_tc = file_reader.read_excelfile(infile, converters=converters)
# Merge Aalborg and Viborg data
df_tc = pd.concat([df_aa_tc, df_vb_tc])
data_columns = ['CitizenId', 'RatingDate', 'RatingScore', 'BirthYear', 'Gender']
df = pd.DataFrame(columns=data_columns)
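        # Cancellation entries use the same packed format: one cell per row, '#'-separated
        # entries with ';'-separated fields.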
for _, row in df_tc.iterrows():
if not pd.isna(row[6]):
                list_of_entries = [line.split(';') for line in row[6].split('#')]
for entry in list(filter(None, list_of_entries)):
entry.insert(0, row[index_col])
entry[-1] = entry[-1].replace('[', '').replace(']', '')
entry_data = pd.DataFrame([entry[:3] + [row[1]] + [row[2]]],
columns=data_columns)
                    df = pd.concat([df, entry_data], ignore_index=True)
# Handle missing values
df['BirthYear'] = df['BirthYear'].fillna(0)
df['RatingDate'] = pd.to_datetime(df['RatingDate'])
        df['RatingScore'] = df['RatingScore'].astype(int)
        df['Gender'] = df['Gender'].astype(str)
        df['BirthYear'] = df['BirthYear'].astype(int)
return df
    def load_screening_content(self, aalborg_file_name: str,
                               viborg_file_name: str, file_path: Path) -> pd.DataFrame:
        """
        Load the DiGiRehab Screening Content data set.
        :param aalborg_file_name: name of the Aalborg file
        :param viborg_file_name: name of the Viborg file
        :param file_path: path of the files
        :return: dataframe with the loaded data
        """
index_col = 'Id'
converters = {index_col: str} # pass as string to avoid float conversion
infile = BytesIO()
with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_aa_sc = file_reader.read_excelfile(infile, converters=converters)
infile = BytesIO()
with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_vb_sc = file_reader.read_excelfile(infile, converters=converters)
# Merge Aalborg and Viborg data
df_sc = pd.concat([df_aa_sc, df_vb_sc])
data_columns = ['CitizenId', 'ScreeningDate', 'NeedForHelpScore',
'NeedForHelpReason', 'PhysicalStrengthScore',
'PhysicalStrengthReason', 'ExerciseContent',
'WalkingSpeed', 'TimedUpAndGo', 'BirthYear', 'Gender']
df = pd.DataFrame(columns=data_columns)
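        # Screening entries are packed the same way ('#'-separated entries, ';'-separated fields);
        # the first eight parsed fields map to ScreeningDate through TimedUpAndGo.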
for _, row in df_sc.iterrows():
if not pd.isna(row[3]):
                list_of_entries = [line.split(';') for line in row[3].split('#')]
for entry in list(filter(None, list_of_entries)):
entry.insert(0, row[index_col])
entry[-1] = entry[-1].replace('[', '').replace(']', '')
entry_data = pd.DataFrame([entry[:9] + [row[1]] + [row[2]]],
columns=data_columns)
                    df = pd.concat([df, entry_data], ignore_index=True)
# Handle missing values
df.replace(to_replace='', value=np.nan, regex=True, inplace=True)
df['BirthYear'] = df['BirthYear'].fillna(0)
df['NeedForHelpScore'] = df['NeedForHelpScore'].fillna(0)
df['PhysicalStrengthScore'] = df['PhysicalStrengthScore'].fillna(0)
df['ExerciseContent'] = df['ExerciseContent'].fillna('0')
df['ScreeningDate'] = pd.to_datetime(df['ScreeningDate'])
        df['NeedForHelpScore'] = df['NeedForHelpScore'].astype(int)
        df['NeedForHelpReason'] = df['NeedForHelpReason'].astype(str)
        df['PhysicalStrengthScore'] = df['PhysicalStrengthScore'].astype(int)
        df['PhysicalStrengthReason'] = df['PhysicalStrengthReason'].astype(str)
        df['ExerciseContent'] = df['ExerciseContent'].astype(str)
        df['WalkingSpeed'] = df['WalkingSpeed'].astype(float)
        df['TimedUpAndGo'] = df['TimedUpAndGo'].astype(float)
        df['Gender'] = df['Gender'].astype(str)
        df['BirthYear'] = df['BirthYear'].astype(int)
return df
    def load_training_done(self, aalborg_file_name: str,
                           viborg_file_name: str, file_path: Path) -> pd.DataFrame:
        """
        Load the DiGiRehab Training Done data set.
        :param aalborg_file_name: name of the Aalborg file
        :param viborg_file_name: name of the Viborg file
        :param file_path: path of the files
        :return: dataframe with the loaded data
        """
index_col = 'Id'
converters = {index_col: str} # pass as string to avoid float conversion
infile = BytesIO()
with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_aa_td = file_reader.read_excelfile(infile, converters=converters)
infile = BytesIO()
with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd:
shutil.copyfileobj(fd, infile)
infile.seek(0)
df_vb_td = file_reader.read_excelfile(infile, converters=converters)
# Merge Aalborg and Viborg data
df_td = pd.concat([df_aa_td, df_vb_td])
data_columns = ['CitizenId', 'RatingDate', 'RatingScore', 'BirthYear', 'Gender']
df = pd.DataFrame(columns=data_columns)
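        # Completed-training entries use the same packed format; entries with fewer than
        # three fields are skipped.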
for _, row in df_td.iterrows():
if not pd.isna(row[4]):
                list_of_entries = [line.split(';') for line in row[4].split('#')]
for entry in list(filter(None, list_of_entries)):
if len(entry) > 2:
entry.insert(0, row[index_col])
entry[-1] = entry[-1].replace('[', '').replace(']', '')
data_entry = pd.DataFrame([entry[:3] + [row[1]] + [row[2]]],
columns=data_columns)
                        df = pd.concat([df, data_entry], ignore_index=True)
# Handle missing values
df.replace(to_replace='', value=np.nan, regex=True, inplace=True)
df['BirthYear'] = df['BirthYear'].fillna(0)
df['RatingDate'] = pd.to_datetime(df['RatingDate'])
        df['RatingScore'] = df['RatingScore'].astype(int)
        df['Gender'] = df['Gender'].astype(str)
        df['BirthYear'] = df['BirthYear'].astype(int)
return df