Source code for tools.data_loader

"""
data_loader.py
====================================
Module to load data for cases.
"""

from abc import ABC, abstractmethod
from sklearn.model_selection import train_test_split
from typing import Tuple, List
from pathlib import Path
from io import StringIO
import shutil
from tools import file_reader, preprocessor
import numpy as np
import pandas as pd

class BaseDataLoader(ABC):
    """
    Base class for data loaders.

    Subclasses implement :meth:`load_data` to populate ``self.X`` (features)
    and ``self.y`` (targets); the remaining methods operate on those fields.
    """

    def __init__(self, file_path: Path, file_name: str,
                 settings: dict, converters: dict = None):
        """
        Initializer method that takes a file path, file name,
        settings and optionally a converter.

        :param file_path: directory containing the data file
        :param file_name: name of the data file
        :param settings: preprocessing settings; must provide the keys
                         'features_to_normalize' and 'features_to_scale'
        :param converters: optional per-column converters forwarded to the
                           CSV reader by subclasses
        """
        self.X: pd.DataFrame = None
        self.y: np.ndarray = None
        self.file_path: Path = file_path
        self.file_name: str = file_name
        self.settings: dict = settings
        self.converters: dict = converters

    @abstractmethod
    def load_data(self) -> None:
        """Loads the data from a data set at startup"""

    def get_data(self) -> Tuple[pd.DataFrame, np.ndarray]:
        """
        This method returns the features and targets
        :return: X and y
        """
        return self.X, self.y

    def get_features(self) -> List[str]:
        """
        This method returns the feature names
        :return: the columns of X as a list
        """
        # Fix: convert the pandas Index to a plain list so the return
        # value actually matches the declared List[str] contract.
        return list(self.X.columns)

    def _preprocess(self) -> Tuple[pd.DataFrame, np.ndarray]:
        """
        Normalize and scale X according to the settings.

        Shared helper for prepare_data and prepare_data_split (previously
        duplicated in both).
        :return: preprocessed X and the raw y
        """
        X = preprocessor.normalize_data(
            self.X, self.settings['features_to_normalize'])
        X = preprocessor.scale_data(
            X, self.settings['features_to_scale'])
        return X, self.y

    def prepare_data(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        This method prepares data by normalizing and scaling it.
        :return: prepared X and y
        """
        X, y = self._preprocess()
        return np.array(X), np.array(y)

    def prepare_data_split(self, test_size: float) \
            -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        This method prepares and splits the data from a data set
        :param test_size: the size of the test set
        :return: a split train and test dataset
        """
        X, y = self._preprocess()
        # Stratify on y and fix the seed so splits are reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=0)
        return (np.array(X_train), np.array(X_test),
                np.array(y_train), np.array(y_test))
class CompleteDataLoader(BaseDataLoader):
    """
    Data loader for Complete case.
    """

    def load_data(self):
        """
        Read the CSV file into memory and split it into features X
        and the 'Complete' target column.
        :return: self, with X and y populated
        """
        source = Path.joinpath(self.file_path, self.file_name)
        with open(source, 'r', newline='', encoding='utf8') as handle:
            buffer = StringIO(handle.read())
        frame = file_reader.read_csv(buffer, converters=self.converters)
        self.y = np.array(frame['Complete'])
        self.X = pd.DataFrame(frame.drop(['Complete'], axis=1))
        return self
class ComplianceDataLoader(BaseDataLoader):
    """
    Data loader for Compliance case.
    """

    def load_data(self):
        """
        Read the CSV file into memory and split it into features X
        and the 'Compliance' target column.
        :return: self, with X and y populated
        """
        source = Path.joinpath(self.file_path, self.file_name)
        with open(source, 'r', newline='', encoding='utf8') as handle:
            buffer = StringIO(handle.read())
        frame = file_reader.read_csv(buffer, converters=self.converters)
        self.y = np.array(frame['Compliance'])
        self.X = pd.DataFrame(frame.drop(['Compliance'], axis=1))
        return self
class FallDataLoader(BaseDataLoader):
    """
    Data loader for Fall case.
    """

    def load_data(self):
        """
        Read the CSV file into memory and split it into features X
        and the 'Fall' target column.
        :return: self, with X and y populated
        """
        source = Path.joinpath(self.file_path, self.file_name)
        with open(source, 'r', newline='', encoding='utf8') as handle:
            buffer = StringIO(handle.read())
        frame = file_reader.read_csv(buffer, converters=self.converters)
        self.y = np.array(frame['Fall'])
        self.X = pd.DataFrame(frame.drop(['Fall'], axis=1))
        return self
class RiskDataLoader(BaseDataLoader):
    """
    Data loader for Risk case.
    """

    def load_data(self):
        """
        Read the CSV file into memory and split it into features X
        and the 'Risk' target column.
        :return: self, with X and y populated
        """
        source = Path.joinpath(self.file_path, self.file_name)
        with open(source, 'r', newline='', encoding='utf8') as handle:
            buffer = StringIO(handle.read())
        frame = file_reader.read_csv(buffer, converters=self.converters)
        self.y = np.array(frame['Risk'])
        self.X = pd.DataFrame(frame.drop(['Risk'], axis=1))
        return self
class AlarmDataLoader(BaseDataLoader):
    """
    Data loader for Alarm case.
    """

    def load_data(self):
        """
        Read the CSV file into memory; features are all columns except
        'Status' and 'Days', and y is a structured array pairing each
        Status with its Days value.
        :return: self, with X and y populated
        """
        source = Path.joinpath(self.file_path, self.file_name)
        with open(source, 'r', newline='', encoding='utf8') as handle:
            buffer = StringIO(handle.read())
        frame = file_reader.read_csv(buffer, converters=self.converters)
        # Build a structured target: (Status, Days) per row, with the
        # same field names and dtypes as before.
        record_dtype = [('Status', 'bool'), ('Days_to_alarm', '>i4')]
        pairs = [tuple(row) for row in frame[['Status', 'Days']].to_numpy()]
        self.y = np.array(pairs, dtype=record_dtype)
        self.X = pd.DataFrame(frame.drop(['Status', 'Days'], axis=1))
        return self