# Source code for tools.raw_loader

"""
raw_loader.py
====================================
Module to load raw data for cases.
"""

from tools import file_reader
from utility.time import assert_datetime
import pandas as pd
import numpy as np
from pathlib import Path
import abc
from io import BytesIO, StringIO
import shutil

class BaseRawLoader2021(metaclass=abc.ABCMeta):
    """
    Abstract interface for loaders of the 2021 raw data sets.

    Concrete subclasses must implement one loader per data set; each
    loader receives the file name(s) and the directory they live in.
    """

    @abc.abstractmethod
    def load_status_set(self, aalborg_file_name, viborg_file_name, file_path):
        """Load the DiGiRehab StatusSet data set."""

    @abc.abstractmethod
    def load_training_cancelled(self, aalborg_file_name, viborg_file_name, file_path):
        """Load the DiGiRehab Training Cancelled data set."""

    @abc.abstractmethod
    def load_screening_content(self, aalborg_file_name, viborg_file_name, file_path):
        """Load the DiGiRehab Screening Content data set."""

    @abc.abstractmethod
    def load_training_done(self, aalborg_file_name, viborg_file_name, file_path):
        """Load the DiGiRehab Training Done data set."""

    @abc.abstractmethod
    def load_assistive_aids(self, aalborg_file_name, viborg_file_name,
                            viborg_alarm_file_name, file_path):
        """Load the DiGiRehab Assistive Aids data set."""

    @abc.abstractmethod
    def load_iso_classes(self, file_name, file_path):
        """Load the ISO classes data set"""
class RawLoader2021(BaseRawLoader2021):
    """Raw loader for the 2021 dataset (Aalborg and Viborg municipalities)."""
    def load_assistive_aids(self, aalborg_file_name: str, viborg_file_name: str,
                            viborg_alarm_file_name: str, file_path: Path) -> pd.DataFrame:
        """
        This method loads assistive aids data for Aalborg and Viborg and
        normalizes both sources into one frame with columns CitizenId,
        BirthYear, Gender, DevISOClass, LendDate, ReturnDate.

        :param aalborg_file_name: name of the Aalborg devices file
        :param viborg_file_name: name of the Viborg devices file
        :param viborg_alarm_file_name: name of the Viborg alarm devices file
        :param file_path: path of the files
        :return: dataframe with loaded data
        """
        # Load Aalborg data
        converters = {'ID': str, 'Kategori ISO nummer': str}
        # Copy the file into memory first, then hand the buffer to the reader.
        infile = BytesIO()
        with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd:
            shutil.copyfileobj(fd, infile)
        infile.seek(0)
        df_aa = file_reader.read_excelfile(infile, converters=converters)
        # Occurrence counter per (citizen, ISO class). The pairing below
        # assumes rows alternate lend/return in file order per group:
        # even Seq = lend event, odd Seq = return event — TODO confirm
        # against the source export.
        df_aa['Seq'] = df_aa.groupby(['ID', 'Kategori ISO nummer']).cumcount()
        df_aa = df_aa[['ID', 'Birth Year', 'Gender', 'Kategori ISO nummer',
                       'Kørselsdato', 'Seq']]
        df_aa['LendDate'] = df_aa.apply(
            lambda x: x['Kørselsdato'] if x['Seq'] % 2 == 0 else pd.NaT, axis=1)
        df_aa['ReturnDate'] = df_aa.apply(
            lambda x: x['Kørselsdato'] if x['Seq'] % 2 == 1 else pd.NaT, axis=1)
        # Pull each return date up onto the lend row that precedes it.
        df_aa['ReturnDate'] = df_aa.groupby(
            ['ID', 'Kategori ISO nummer'])['ReturnDate'].shift(-1)
        # thresh=1: keep rows that have at least one of LendDate/ReturnDate.
        df_aa = df_aa.dropna(subset=['LendDate', 'ReturnDate'], thresh=1)
        df_aa = df_aa.drop(['Kørselsdato', 'Seq'], axis=1)
        df_aa = df_aa.rename(columns={'ID': 'CitizenId',
                                      'Birth Year': 'BirthYear',
                                      'Kategori ISO nummer': 'DevISOClass'})
        df_aa['LendDate'] = pd.to_datetime(df_aa['LendDate'])
        # coerce: unparseable return dates become NaT (device not returned).
        df_aa['ReturnDate'] = pd.to_datetime(df_aa['ReturnDate'], errors = 'coerce')

        # Load Viborg data (already has explicit lend/return columns).
        converters = {'BorgerID': str, 'Kategori ISO nummer': str}
        infile = BytesIO()
        with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd:
            shutil.copyfileobj(fd, infile)
        infile.seek(0)
        df_vb = file_reader.read_excelfile(infile, converters=converters)
        df_vb = df_vb[['BorgerID', 'Birth Year', 'Gender', 'Kategori ISO nummer',
                       'Leveret Dato', 'Returneret Dato']]
        df_vb = df_vb.rename(columns={'BorgerID': 'CitizenId',
                                      'Birth Year': 'BirthYear',
                                      'Kategori ISO nummer': 'DevISOClass',
                                      'Leveret Dato': 'LendDate',
                                      'Returneret Dato': 'ReturnDate'})
        df_vb['LendDate'] = pd.to_datetime(df_vb['LendDate'])
        df_vb['ReturnDate'] = pd.to_datetime(df_vb['ReturnDate'], errors='coerce')

        # Load Viborg alarm file
        converters = {'CitizenId': str}
        infile = BytesIO()
        with open(Path.joinpath(file_path, viborg_alarm_file_name), 'rb') as fd:
            shutil.copyfileobj(fd, infile)
        infile.seek(0)
        df_vb_alarm = file_reader.read_excelfile(infile, converters=converters)
        df_vb_alarm = df_vb_alarm[['CitizenId', 'BirthYear', 'Gender',
                                   'Ikrafttrædelsesdato', 'Visitation slut']]
        # Alarm devices carry no ISO class in the export; a single
        # hard-coded class code is assigned to all of them.
        df_vb_alarm['DevISOClass'] = '22271812'
        df_vb_alarm = df_vb_alarm.rename(columns={'Ikrafttrædelsesdato': 'LendDate',
                                                  'Visitation slut': 'ReturnDate'})
        df_vb_alarm['LendDate'] = pd.to_datetime(df_vb_alarm['LendDate'])
        df_vb_alarm['ReturnDate'] = pd.to_datetime(df_vb_alarm['ReturnDate'],
                                                   errors='coerce')

        # Merge Viborg data
        df_vb = pd.concat([df_vb, df_vb_alarm])
        # Merge Aalborg and Viborg data
        df = pd.concat([df_aa, df_vb])
        return df
[docs] def load_iso_classes(self, file_name: str, file_path: Path) -> pd.DataFrame: """ This method loads iso classes :param file_name: name of file :param file_path: path of file :return: dataframe with loaded data """ isoclass_file = Path.joinpath(file_path, file_name) col_names = ['DevISOClass', 'GroupSize', 'Description'] converters = {i: str for i in range(0, 10000)} infile = StringIO() with open(isoclass_file, 'r', encoding='utf8') as fd: shutil.copyfileobj(fd, infile) infile.seek(0) df = file_reader.read_csv(infile, header=None, usecols=[0, 1, 2], names=col_names, converters=converters) return df
[docs] def load_status_set(self, aalborg_file_name: str, viborg_file_name: str, file_path: Path) -> pd.DataFrame: """ This method loads status set data :param file_name: name of file :param file_path: path of file :return: dataframe with loaded data """ index_col = 'Id' converters = {index_col: str} # pass as string to avoid float conversion infile = BytesIO() with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd: shutil.copyfileobj(fd, infile) infile.seek(0) df_aa_ss = file_reader.read_excelfile(infile, converters=converters) infile = BytesIO() with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd: shutil.copyfileobj(fd, infile) infile.seek(0) df_vb_ss = file_reader.read_excelfile(infile, converters=converters) # Merge Aalborg and Viborg data df_ss = pd.concat([df_aa_ss, df_vb_ss]) data_columns = ['CitizenId', 'ChangeDate', 'Status', 'BirthYear', 'Gender'] df = pd.DataFrame(columns=data_columns) for _, row in df_ss.iterrows(): if not pd.isna(row[6]): list_of_entries = [str.split(line, ';') for line in str.split(row[6], '#')] for entry in list(filter(None, list_of_entries)): entry.insert(0, row[index_col]) entry[-1] = entry[-1].replace('[', '').replace(']', '') if not assert_datetime(entry[1]): # Handle a case where time is missing entry[1] = np.nan entry[2] = 'AutoTerminated' entry_data = pd.DataFrame([entry[:3] + [row[1]] + [row[2]]], columns=data_columns) df = df.append(entry_data, ignore_index=True) # Handle missing values df['BirthYear'] = df['BirthYear'].fillna(0) df['ChangeDate'] = pd.to_datetime(df['ChangeDate']) df['Status'] = pd.Series.astype(df['Status'], dtype=str) df['Gender'] = pd.Series.astype(df['Gender'], dtype=str) df['BirthYear'] = pd.Series.astype(df['BirthYear'], dtype=int) return df
[docs] def load_training_cancelled(self, aalborg_file_name: str, viborg_file_name: str, file_path: Path) -> pd.DataFrame: """ This method loads training cancellation data :param file_name: name of file :param file_path: path of file :return: dataframe with loaded data """ index_col = 'Id' converters = {index_col: str} # pass as string to avoid float conversion infile = BytesIO() with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd: shutil.copyfileobj(fd, infile) infile.seek(0) df_aa_tc = file_reader.read_excelfile(infile, converters=converters) infile = BytesIO() with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd: shutil.copyfileobj(fd, infile) infile.seek(0) df_vb_tc = file_reader.read_excelfile(infile, converters=converters) # Merge Aalborg and Viborg data df_tc = pd.concat([df_aa_tc, df_vb_tc]) data_columns = ['CitizenId', 'RatingDate', 'RatingScore', 'BirthYear', 'Gender'] df = pd.DataFrame(columns=data_columns) for _, row in df_tc.iterrows(): if not pd.isna(row[6]): list_of_entries = [str.split(line, ';') for line in str.split(row[6], '#')] for entry in list(filter(None, list_of_entries)): entry.insert(0, row[index_col]) entry[-1] = entry[-1].replace('[', '').replace(']', '') entry_data = pd.DataFrame([entry[:3] + [row[1]] + [row[2]]], columns=data_columns) df = df.append(entry_data, ignore_index=True) # Handle missing values df['BirthYear'] = df['BirthYear'].fillna(0) df['RatingDate'] = pd.to_datetime(df['RatingDate']) df['RatingScore'] = pd.Series.astype(df['RatingScore'], dtype=int) df['Gender'] = pd.Series.astype(df['Gender'], dtype=str) df['BirthYear'] = pd.Series.astype(df['BirthYear'], dtype=int) return df
[docs] def load_screening_content(self, aalborg_file_name: str, viborg_file_name: str, file_path: Path) -> pd.DataFrame: """ This method loads screening datta :param file_name: name of file :param file_path: path of file :return: dataframe with loaded data """ index_col = 'Id' converters = {index_col: str} # pass as string to avoid float conversion infile = BytesIO() with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd: shutil.copyfileobj(fd, infile) infile.seek(0) df_aa_sc = file_reader.read_excelfile(infile, converters=converters) infile = BytesIO() with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd: shutil.copyfileobj(fd, infile) infile.seek(0) df_vb_sc = file_reader.read_excelfile(infile, converters=converters) # Merge Aalborg and Viborg data df_sc = pd.concat([df_aa_sc, df_vb_sc]) data_columns = ['CitizenId', 'ScreeningDate', 'NeedForHelpScore', 'NeedForHelpReason', 'PhysicalStrengthScore', 'PhysicalStrengthReason', 'ExerciseContent', 'WalkingSpeed', 'TimedUpAndGo', 'BirthYear', 'Gender'] df = pd.DataFrame(columns=data_columns) for _, row in df_sc.iterrows(): if not pd.isna(row[3]): list_of_entries = [str.split(line, ';') for line in str.split(row[3], '#')] for entry in list(filter(None, list_of_entries)): entry.insert(0, row[index_col]) entry[-1] = entry[-1].replace('[', '').replace(']', '') entry_data = pd.DataFrame([entry[:9] + [row[1]] + [row[2]]], columns=data_columns) df = df.append(entry_data, ignore_index=True) # Handle missing values df.replace(to_replace='', value=np.nan, regex=True, inplace=True) df['BirthYear'] = df['BirthYear'].fillna(0) df['NeedForHelpScore'] = df['NeedForHelpScore'].fillna(0) df['PhysicalStrengthScore'] = df['PhysicalStrengthScore'].fillna(0) df['ExerciseContent'] = df['ExerciseContent'].fillna('0') df['ScreeningDate'] = pd.to_datetime(df['ScreeningDate']) df['NeedForHelpScore'] = pd.Series.astype(df['NeedForHelpScore'], dtype=int) df['NeedForHelpReason'] = pd.Series.astype(df['NeedForHelpReason'], 
dtype=str) df['PhysicalStrengthScore'] = pd.Series.astype(df['PhysicalStrengthScore'], dtype=int) df['PhysicalStrengthReason'] = pd.Series.astype(df['PhysicalStrengthReason'], dtype=str) df['ExerciseContent'] = pd.Series.astype(df['ExerciseContent'], dtype=str) df['WalkingSpeed'] = pd.Series.astype(df['WalkingSpeed'], dtype=float) df['TimedUpAndGo'] = pd.Series.astype(df['TimedUpAndGo'], dtype=float) df['Gender'] = pd.Series.astype(df['Gender'], dtype=str) df['BirthYear'] = pd.Series.astype(df['BirthYear'], dtype=int) return df
[docs] def load_training_done(self, aalborg_file_name: str, viborg_file_name: str, file_path: Path) -> pd.DataFrame: """ This method loads training completed datta :param file_name: name of file :param file_path: path of file :return: dataframe with loaded data """ index_col = 'Id' converters = {index_col: str} # pass as string to avoid float conversion infile = BytesIO() with open(Path.joinpath(file_path, aalborg_file_name), 'rb') as fd: shutil.copyfileobj(fd, infile) infile.seek(0) df_aa_td = file_reader.read_excelfile(infile, converters=converters) infile = BytesIO() with open(Path.joinpath(file_path, viborg_file_name), 'rb') as fd: shutil.copyfileobj(fd, infile) infile.seek(0) df_vb_td = file_reader.read_excelfile(infile, converters=converters) # Merge Aalborg and Viborg data df_td = pd.concat([df_aa_td, df_vb_td]) data_columns = ['CitizenId', 'RatingDate', 'RatingScore', 'BirthYear', 'Gender'] df = pd.DataFrame(columns=data_columns) for _, row in df_td.iterrows(): if not pd.isna(row[4]): list_of_entries = [str.split(line, ";") for line in str.split(row[4], "#")] for entry in list(filter(None, list_of_entries)): if len(entry) > 2: entry.insert(0, row[index_col]) entry[-1] = entry[-1].replace('[', '').replace(']', '') data_entry = pd.DataFrame([entry[:3] + [row[1]] + [row[2]]], columns=data_columns) df = df.append(data_entry, ignore_index=True) # Handle missing values df.replace(to_replace='', value=np.nan, regex=True, inplace=True) df['BirthYear'] = df['BirthYear'].fillna(0) df['RatingDate'] = pd.to_datetime(df['RatingDate']) df['RatingScore'] = pd.Series.astype(df['RatingScore'], dtype=int) df['Gender'] = pd.Series.astype(df['Gender'], dtype=str) df['BirthYear'] = pd.Series.astype(df['BirthYear'], dtype=int) return df