"""
data_loader.py
====================================
Module to load data for cases.
"""
from abc import ABC, abstractmethod
from sklearn.model_selection import train_test_split
from typing import Tuple, List
from pathlib import Path
from io import StringIO
import shutil
from tools import file_reader, preprocessor
import numpy as np
import pandas as pd
class BaseDataLoader(ABC):
    """
    Base class for data loaders.

    Subclasses implement :meth:`load_data` to populate ``self.X`` (features)
    and ``self.y`` (targets) from a data set on disk.
    """
    def __init__(self,
                 file_path: Path,
                 file_name: str,
                 settings: dict,
                 converters: dict = None):
        """
        Initializer method that takes a file path, file name,
        settings and optionally a converter.

        :param file_path: directory containing the data file
        :param file_name: name of the data file
        :param settings: configuration dict; must contain the keys
            'features_to_normalize' and 'features_to_scale' for the
            ``prepare_*`` methods to work
        :param converters: optional per-column converters forwarded to the
            CSV reader
        """
        self.X: pd.DataFrame = None
        self.y: np.ndarray = None
        self.file_path: Path = file_path
        self.file_name: str = file_name
        self.settings: dict = settings
        self.converters: dict = converters

    @abstractmethod
    def load_data(self) -> None:
        """Loads the data from a data set at startup"""

    def get_data(self) -> Tuple[pd.DataFrame, np.ndarray]:
        """
        This method returns the features and targets
        :return: X and y
        """
        return self.X, self.y

    def get_features(self) -> List[str]:
        """
        This method returns the feature names
        :return: the columns of X as a list
        """
        # Fix: previously returned the raw pandas Index even though both the
        # annotation and the docstring promise a plain list of names.
        return list(self.X.columns)

    def _preprocess(self) -> pd.DataFrame:
        """Normalize and scale self.X per the settings dict (shared by the
        prepare_* methods)."""
        X = preprocessor.normalize_data(
            self.X, self.settings['features_to_normalize'])
        X = preprocessor.scale_data(
            X, self.settings['features_to_scale'])
        return X

    def prepare_data(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        This method prepares data by normalizing and scaling it.
        :return: prepared X and y as NumPy arrays
        """
        X = self._preprocess()
        return np.array(X), np.array(self.y)

    def prepare_data_split(self, test_size: float) -> Tuple[np.ndarray, np.ndarray,
                                                            np.ndarray, np.ndarray]:
        """
        This method prepares and splits the data from a data set
        :param test_size: the size of the test set
        :return: a split train and test dataset
            (X_train, X_test, y_train, y_test) as NumPy arrays
        """
        X = self._preprocess()
        # stratify=y keeps the class distribution in both partitions;
        # random_state=0 makes the split reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, self.y, test_size=test_size, stratify=self.y, random_state=0)
        return (np.array(X_train), np.array(X_test),
                np.array(y_train), np.array(y_test))
class CompleteDataLoader(BaseDataLoader):
    """
    Data loader for Complete case.
    """
    def load_data(self):
        """Read the CSV file into memory and split it into features X and
        the 'Complete' target column. Returns self for chaining."""
        source = self.file_path / self.file_name
        with open(source, 'r', newline='', encoding='utf8') as fd:
            # Buffer the whole file in memory before parsing.
            buffer = StringIO(fd.read())
        df = file_reader.read_csv(buffer, converters=self.converters)
        self.X = pd.DataFrame(df.drop(['Complete'], axis=1))
        self.y = np.array(df['Complete'])
        return self
class ComplianceDataLoader(BaseDataLoader):
    """
    Data loader for Compliance case.
    """
    def load_data(self):
        """Read the CSV file into memory and split it into features X and
        the 'Compliance' target column. Returns self for chaining."""
        source = self.file_path / self.file_name
        with open(source, 'r', newline='', encoding='utf8') as fd:
            # Buffer the whole file in memory before parsing.
            buffer = StringIO(fd.read())
        df = file_reader.read_csv(buffer, converters=self.converters)
        self.X = pd.DataFrame(df.drop(['Compliance'], axis=1))
        self.y = np.array(df['Compliance'])
        return self
class FallDataLoader(BaseDataLoader):
    """
    Data loader for Fall case.
    """
    def load_data(self):
        """Read the CSV file into memory and split it into features X and
        the 'Fall' target column. Returns self for chaining."""
        source = self.file_path / self.file_name
        with open(source, 'r', newline='', encoding='utf8') as fd:
            # Buffer the whole file in memory before parsing.
            buffer = StringIO(fd.read())
        df = file_reader.read_csv(buffer, converters=self.converters)
        self.X = pd.DataFrame(df.drop(['Fall'], axis=1))
        self.y = np.array(df['Fall'])
        return self
class RiskDataLoader(BaseDataLoader):
    """
    Data loader for Risk case.
    """
    def load_data(self):
        """Read the CSV file into memory and split it into features X and
        the 'Risk' target column. Returns self for chaining."""
        source = self.file_path / self.file_name
        with open(source, 'r', newline='', encoding='utf8') as fd:
            # Buffer the whole file in memory before parsing.
            buffer = StringIO(fd.read())
        df = file_reader.read_csv(buffer, converters=self.converters)
        self.X = pd.DataFrame(df.drop(['Risk'], axis=1))
        self.y = np.array(df['Risk'])
        return self
class AlarmDataLoader(BaseDataLoader):
    """
    Data loader for Alarm case.
    """
    def load_data(self):
        """Read the CSV file into memory and split it into features X and a
        structured target array built from the 'Status' and 'Days' columns.
        Returns self for chaining."""
        source = self.file_path / self.file_name
        with open(source, 'r', newline='', encoding='utf8') as fd:
            # Buffer the whole file in memory before parsing.
            buffer = StringIO(fd.read())
        df = file_reader.read_csv(buffer, converters=self.converters)
        self.X = pd.DataFrame(df.drop(['Status', 'Days'], axis=1))
        # Targets are a NumPy structured array: (event flag, big-endian int32
        # days-to-alarm) — the record layout survival-analysis tooling expects.
        records = [tuple(row) for row in df[['Status', 'Days']].to_numpy()]
        y = np.array(records,
                     dtype=[('Status', 'bool'), ('Days_to_alarm', '>i4')])
        self.y = np.array(y)
        return self