data_loader.py 5.66 KB
Newer Older
1
import numpy as np
2
import paths as pt
3
from abc import ABC, abstractmethod
4
from tools import file_reader
5
6
7
8
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

class BaseDataLoader(ABC):
    """Abstract base class for tabular data loaders.

    Subclasses implement :meth:`load_data` to populate ``self.X``
    (a pandas DataFrame of features) and ``self.y`` (the target column).
    """

    def __init__(self, file_name, converters=None):
        """Initializer method that takes a file name and optionally a converter.

        :param file_name: name of the data file to read.
        :param converters: optional mapping of column converters passed
            through to the CSV reader.
        """
        self.file_name = file_name
        self.converters = converters

    @abstractmethod
    def load_data(self):
        """Loads the data from a data set at startup"""

    def get_data(self):
        """Returns the features and target"""
        return self.X, self.y

    def get_features(self):
        """Returns the feature names"""
        return self.X.columns

    def _scale_features(self, X, scaling_strategy, emb_regex=r'((\d+)[Ats])\w+'):
        """Scale the leading non-embedding columns of ``X``.

        Columns of ``self.X`` whose names match ``emb_regex`` are treated as
        embedding columns sitting at the end of the frame and are left
        unscaled.  ``"Standard"`` selects :class:`StandardScaler`; any other
        non-None value falls back to :class:`MinMaxScaler` (this mirrors the
        original if/else behavior).  ``None`` returns ``X`` unchanged.

        NOTE(review): the regex uses a character class ``[Ats]`` (matches
        the single characters A, t or s after a digit prefix) — kept as-is
        to preserve behavior; confirm it matches the intended column names.
        """
        if scaling_strategy is None:
            return X
        emb_cols = self.X.filter(regex=emb_regex, axis=1)
        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
        if scaling_strategy == "Standard":
            scaler = StandardScaler()
        else:
            scaler = MinMaxScaler()
        X_sc = scaler.fit_transform(X[:, :n_scale_cols])
        return np.concatenate([X_sc, X[:, n_scale_cols:]], axis=1)

    def prepare_data(self, scaling_strategy=None):
        """Prepares the data from a data set.

        :param scaling_strategy: None (no scaling), "Standard", or any
            other value for min-max scaling.
        :return: tuple of numpy arrays ``(X, y)``.
        """
        X = np.array(self.X)
        y = np.array(self.y)
        X = self._scale_features(X, scaling_strategy)
        return X, y

    def prepare_data_split(self, test_size, scaling_strategy=None):
        """Prepares and splits the data from a data set.

        :param test_size: fraction of samples assigned to the test set.
        :param scaling_strategy: see :meth:`prepare_data`.
        :return: ``X_train, X_test, y_train, y_test`` (stratified on y,
            fixed ``random_state=0`` for reproducibility).
        """
        X, y = self.prepare_data(scaling_strategy)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=0)
        return X_train, X_test, y_train, y_test

68
69
class CompleteDataLoader(BaseDataLoader):
    """Data loader whose target column is 'Complete'."""

    def load_data(self):
        """Read the processed CSV and split off the 'Complete' target.

        Populates ``self.X`` / ``self.y`` and returns ``self`` so the
        call can be chained.
        """
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                  self.file_name,
                                  converters=self.converters)
        self.y = df['Complete']
        self.X = df.drop(['Complete'], axis=1)
        return self
    
thecml's avatar
thecml committed
79
80
81
82
83
84
85
86
87
88
class ComplianceDataLoader(BaseDataLoader):
    """Data loader whose target column is 'Compliance'."""

    def load_data(self):
        """Read the processed CSV and split off the 'Compliance' target.

        Populates ``self.X`` / ``self.y`` and returns ``self`` so the
        call can be chained.
        """
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                  self.file_name,
                                  converters=self.converters)
        self.y = df['Compliance']
        self.X = df.drop(['Compliance'], axis=1)
        return self
89
90
91

class FallDataLoader(BaseDataLoader):
    """Data loader whose target column is 'Fall'."""

    def load_data(self):
        """Read the processed CSV and split off the 'Fall' target.

        Populates ``self.X`` / ``self.y`` and returns ``self`` so the
        call can be chained.
        """
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                  self.file_name,
                                  converters=self.converters)
        self.y = df['Fall']
        self.X = df.drop(['Fall'], axis=1)
        return self
100
101
102

class FallTestDataLoader(BaseDataLoader):
    """Data loader for the fall-test data set ('Fall' target).

    Overrides the preparation methods so that the extended embedding
    column pattern (Ats/Ex variants) is excluded from scaling.
    """

    # NOTE(review): '[Ats|Ex]' is a character class that matches any
    # single character of A, t, s, |, E, x after a digit prefix — it was
    # probably meant as an alternation, but it is kept byte-identical
    # here to preserve the original column selection; verify upstream.
    _EMB_REGEX = r'((\d+)[Ats|Ex])\w+'

    def load_data(self):
        """Read the processed CSV and split off the 'Fall' target.

        Populates ``self.X`` / ``self.y`` and returns ``self``.
        """
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                  self.file_name,
                                  converters=self.converters)
        self.X = df.drop(['Fall'], axis=1)
        self.y = df['Fall']
        return self

    def _scale(self, X, scaling_strategy):
        """Scale the leading non-embedding columns of ``X``.

        Embedding columns (names matching ``_EMB_REGEX``) at the end of
        the frame are passed through unscaled.  ``"Standard"`` selects
        StandardScaler; any other non-None value falls back to
        MinMaxScaler (mirrors the original if/else).  ``None`` is a no-op.
        """
        if scaling_strategy is None:
            return X
        emb_cols = self.X.filter(regex=self._EMB_REGEX, axis=1)
        n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
        if scaling_strategy == "Standard":
            scaler = StandardScaler()
        else:
            scaler = MinMaxScaler()
        X_sc = scaler.fit_transform(X[:, :n_scale_cols])
        return np.concatenate([X_sc, X[:, n_scale_cols:]], axis=1)

    def prepare_data(self, scaling_strategy: str = None):
        """Return ``(X, y)`` as numpy arrays, optionally scaling features.

        :param scaling_strategy: None, "Standard", or any other value
            for min-max scaling.
        """
        X = self._scale(np.array(self.X), scaling_strategy)
        y = np.array(self.y)
        return X, y

    def prepare_data_split(self, test_size: float,
                           scaling_strategy: str = None):
        """Return a stratified train/test split of the prepared data.

        :param test_size: fraction of samples assigned to the test set.
        :param scaling_strategy: see :meth:`prepare_data`.
        :return: ``X_train, X_test, y_train, y_test`` (stratified on y,
            fixed ``random_state=0`` for reproducibility).
        """
        X, y = self.prepare_data(scaling_strategy)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=0)
        return X_train, X_test, y_train, y_test