#!/usr/bin/env python
import csv
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from scipy.stats import skew, boxcox
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import (MaxAbsScaler, MinMaxScaler, QuantileTransformer,
                                   RobustScaler, StandardScaler)

import paths as pt
from tools import data_loader
from tools.classifiers import (KnnClassifier, LrClassifier, MlpClassifier,
                               RfClassifier, SvmClassifier, XgbClassifier)
from utility.metrics import compute_mean, compute_std
from utility.settings import load_settings

class DummyScaler(BaseEstimator, TransformerMixin):
    """No-op scaler: returns the input unchanged (as an ndarray).

    Stands in for a real scaler so the benchmark loop can treat
    "no scaling" like any other option.  Implements ``fit``/``transform``
    instead of overriding ``fit_transform`` directly, so the
    ``TransformerMixin`` supplies a contract-conforming ``fit_transform``
    (the original override shadowed the mixin and omitted the optional
    ``y`` argument, making the class unusable in sklearn pipelines).
    """

    def fit(self, X, y=None):
        # Nothing to learn; returning self preserves sklearn chaining.
        return self

    def transform(self, X):
        # np.array(...) copies, matching the original's return semantics.
        return np.array(X)

class DummyNormalizer(BaseEstimator, TransformerMixin):
    """Pass-through normalizer: the "None" normalization option."""

    def fit_transform(self, X, case=None):
        """Return *X* converted to an ndarray; *case* is ignored.

        The *case* parameter exists only for interface parity with the
        Box-Cox normalizers, which select feature sets by case.
        """
        del case  # accepted but unused
        return np.array(X)

class BoxCoxNormalizer(BaseEstimator, TransformerMixin):
    """Box-Cox-transform skewed numeric features (gender columns included).

    Features whose sample skewness exceeds 0.25 are shifted by +1 and
    Box-Cox transformed with a per-feature fitted lambda.

    NOTE(review): the +1 shift only guarantees the positivity that
    ``scipy.stats.boxcox`` requires when inputs are >= 0 — confirm the
    upstream features are non-negative.
    """

    def fit_transform(self, X, case=None):
        """Return *X* as an ndarray with skewed numeric columns transformed.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix containing the case-specific numeric columns.
        case : str, optional
            ``"Risk"`` selects the extended training-history feature set;
            any other value selects the basic demographic set.
        """
        if case == "Risk":
            numeric_feats = ['Gender_Male', 'Gender_Female', 'BirthYear', 'Cluster',
                    'LoanPeriod', 'NumberSplit', 'NumberScreening', 'NumberWeeks',
                    'MeanEvaluation', 'NumberFalls', 'NumberTraining', 'NumberTrainingWeek',
                    'TimeBetweenTraining', 'NumberWeeksNoTraining', 'Needs', 'Physics',
                    'NumberAts', 'NumberEx']
        else:
            numeric_feats = ['Gender_Male', 'Gender_Female', 'BirthYear',
                            'Cluster', 'LoanPeriod', 'NumberAts']

        # Work on a copy: the original mutated the caller's (possibly
        # sliced) DataFrame in place, risking SettingWithCopy issues and
        # hidden state leaking between benchmark iterations.
        X = X.copy()
        skewness = X[numeric_feats].apply(lambda col: skew(col.dropna()))
        for feat in skewness[skewness > 0.25].index:
            # +1 shift to make values strictly positive before Box-Cox;
            # the fitted lambda is not needed downstream.
            X[feat], _ = boxcox(X[feat] + 1)
        return np.array(X)
class BoxCoxNormalizerNoGender(BaseEstimator, TransformerMixin):
    """Box-Cox-transform skewed numeric features, excluding gender columns.

    Identical to ``BoxCoxNormalizer`` except the one-hot gender columns
    are left untouched.

    NOTE(review): the +1 shift only guarantees the positivity that
    ``scipy.stats.boxcox`` requires when inputs are >= 0 — confirm the
    upstream features are non-negative.
    """

    def fit_transform(self, X, case=None):
        """Return *X* as an ndarray with skewed numeric columns transformed.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix containing the case-specific numeric columns.
        case : str, optional
            ``"Risk"`` selects the extended training-history feature set;
            any other value selects the basic demographic set.
        """
        if case == "Risk":
            numeric_feats = ['BirthYear', 'Cluster',
                    'LoanPeriod', 'NumberSplit', 'NumberScreening', 'NumberWeeks',
                    'MeanEvaluation', 'NumberFalls', 'NumberTraining', 'NumberTrainingWeek',
                    'TimeBetweenTraining', 'NumberWeeksNoTraining', 'Needs', 'Physics',
                    'NumberAts', 'NumberEx']
        else:
            numeric_feats = ['BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts']

        # Work on a copy: the original mutated the caller's (possibly
        # sliced) DataFrame in place, risking SettingWithCopy issues and
        # hidden state leaking between benchmark iterations.
        X = X.copy()
        skewness = X[numeric_feats].apply(lambda col: skew(col.dropna()))
        for feat in skewness[skewness > 0.25].index:
            # +1 shift to make values strictly positive before Box-Cox;
            # the fitted lambda is not needed downstream.
            X[feat], _ = boxcox(X[feat] + 1)
        return np.array(X)

def _load_case_data(case):
    """Load and return (X, y) for the given experiment *case*.

    Each call reloads from disk on purpose: the normalizers may mutate
    X, so sharing one DataFrame across normalizer/scaler combinations
    could leak state between iterations.
    """
    if case == "Complete":
        settings = load_settings("complete_emb.yaml")
        dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
    elif case == "Compliance":
        settings = load_settings("compliance_emb.yaml")
        dl = data_loader.ComplianceDataLoader("compliance_emb.csv", settings).load_data()
    elif case == "Fall":
        settings = load_settings("fall_emb.yaml")
        dl = data_loader.AlarmDataLoader("fall_emb.csv", settings).load_data()
    else:  # "Risk"
        settings = load_settings("risk_emb.yaml")
        dl = data_loader.FallDataLoader("risk_emb.csv", settings).load_data()
    return dl.get_data()

def main():
    """Benchmark normalizer/scaler combinations for each case.

    For every case, every (normalizer, scaler) pair is applied to the
    non-embedding columns, six classifiers are evaluated on the result,
    and the mean/std of each metric is appended to a per-case CSV report
    in ``pt.REPORTS_DIR``.
    """
    cases = ["Complete", "Compliance", "Fall", "Risk"]
    metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'f1']
    normalizer_names = ["None", "BoxCox", "BoxCoxNoGender"]
    normalizers = [DummyNormalizer(), BoxCoxNormalizer(), BoxCoxNormalizerNoGender()]
    # Names and scalers are zipped pairwise, so the lists must match in
    # length and order.  (The original had 9 scalers for 8 names: zip
    # truncation paired "QuantileTransformerNorm" with a plain
    # QuantileTransformer(random_state=0) and the normal-output variant
    # was silently never evaluated.)
    scaler_names = ["None", "Standard", "MinMax", "MinMaxRange", "Robust",
                    "MaxAbs", "QuantileTransformer", "QuantileTransformerNorm"]
    scalers = [DummyScaler(), StandardScaler(), MinMaxScaler(), MinMaxScaler((-1, 1)),
               RobustScaler(), MaxAbsScaler(), QuantileTransformer(random_state=0),
               QuantileTransformer(output_distribution='normal', random_state=0)]
    for case in cases:
        output_filename = f"{case} model preprocessing.csv"
        header = ['clf', 'normalizer', 'scaler', 'accuracy_mean', 'accuracy_std',
                  'precision_mean', 'precision_std', 'recall_mean', 'recall_std',
                  'roc_auc_mean', 'roc_auc_std', 'pr_auc_mean', 'pr_auc_std',
                  'f1_mean', 'f1_std']
        # (Re)create the report file with just the header row.
        with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'w',
                  encoding='UTF8', newline='') as f:
            csv.writer(f).writerow(header)

        for normalizer_name, normalizer in zip(normalizer_names, normalizers):
            for scaler_name, scaler in zip(scaler_names, scalers):
                X, y = _load_case_data(case)

                # Embedding columns (digit-prefixed "…Ats…"/"…Ex…" names)
                # are excluded from normalization/scaling; all columns
                # before them are treated as numeric.
                # NOTE(review): the character class [Ats|Ex] matches
                # single characters, not the alternation (?:Ats|Ex) —
                # kept byte-identical to preserve column selection;
                # verify intent against the actual column names.
                emb_cols = X.filter(regex=r'((\d+)[Ats|Ex])\w+', axis=1)
                n_norm_cols = X.shape[1] - emb_cols.shape[1]

                # Normalize only the non-embedding block, then stitch the
                # embedding columns back on unchanged.
                X_norm = pd.DataFrame(normalizer.fit_transform(X.iloc[:, :n_norm_cols], case))
                X = np.array(pd.concat([X_norm, X.iloc[:, n_norm_cols:]], axis=1))
                y = np.array(y)
                X_sc = scaler.fit_transform(X[:, :n_norm_cols])
                X = np.concatenate([X_sc, X[:, n_norm_cols:]], axis=1)

                results = {'KNN': KnnClassifier(X, y).evaluate(metrics),
                           'SVM': SvmClassifier(X, y).evaluate(metrics),
                           'LR': LrClassifier(X, y).evaluate(metrics),
                           'XGB': XgbClassifier(X, y).evaluate(metrics),
                           'RF': RfClassifier(X, y).evaluate(metrics),
                           'MLP': MlpClassifier(X, y).evaluate(metrics)}

                # Append one row per classifier: mean/std for each metric.
                with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'a',
                          encoding='UTF8', newline='') as f:
                    writer = csv.writer(f)
                    for clf_name, result in results.items():
                        row = [clf_name, normalizer_name, scaler_name]
                        for metric in metrics:
                            row.extend((compute_mean(result[metric]),
                                        compute_std(result[metric])))
                        writer.writerow(row)

if __name__ == "__main__":
    # Script entry point: run the full preprocessing benchmark.
    main()