Source code for analysis.evaluate_preprocessing

"""
evaluate_preprocessing.py
====================================
Script to evaluate preprocessing strategies for cases.
"""

import numpy as np
import pandas as pd
from tools import data_loader
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler, MaxAbsScaler, QuantileTransformer
from tools.classifiers import KnnClassifier, SvmClassifier, LrClassifier
from tools.classifiers import XgbClassifier, RfClassifier, MlpClassifier
from utility.config import load_config
from utility.metrics import compute_mean, compute_std
import csv
import paths as pt

class DummyScaler(BaseEstimator, TransformerMixin):
    """No-op scaler used as the "None" entry in the scaler grid.

    The original only defined ``fit_transform``, which broke the sklearn
    transformer contract implied by ``TransformerMixin``; ``fit`` and
    ``transform`` are provided so the class is interchangeable with the
    real scalers. ``fit_transform`` keeps its original behaviour.
    """

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` for sklearn-style chaining."""
        return self

    def transform(self, X):
        """Return *X* unchanged as a numpy array."""
        return np.array(X)

    def fit_transform(self, X, y=None):
        """Return *X* unchanged as a numpy array (*y* is ignored)."""
        return np.array(X)
class DummyNormalizer(BaseEstimator, TransformerMixin):
    """Pass-through normalizer: the "None" option in the normalizer grid.

    Mirrors the ``(X, case)`` interface of the Box-Cox normalizers but
    applies no transformation at all.
    """

    def fit_transform(self, X, case=None):
        """Convert *X* to a numpy array without altering its values.

        :param X: feature matrix (DataFrame or array-like).
        :param case: accepted for interface compatibility; ignored.
        :return: ``numpy.ndarray`` copy of *X*.
        """
        return np.array(X)
class BoxCoxNormalizer(BaseEstimator, TransformerMixin):
    """Box-Cox-normalize skewed numeric features (Gender included).

    Numeric features whose skewness exceeds 0.25 are shifted by +1 and
    Box-Cox transformed; all other columns pass through unchanged.

    Fix over the original: works on a copy of *X*, so the caller's
    DataFrame is no longer mutated in place.
    """

    # Skewness threshold above which a feature is transformed.
    SKEW_THRESHOLD = 0.25

    def fit_transform(self, X, case=None):
        """Return *X* as a numpy array with skewed features normalized.

        :param X: pandas DataFrame containing at least the case's
                  numeric feature columns.
        :param case: ``"Risk"`` selects the extended feature set; any
                     other value selects the basic one.
        :return: ``numpy.ndarray`` with the same shape as *X*.
        """
        if case == "Risk":
            numeric_feats = ['Gender', 'BirthYear', 'LoanPeriod', 'NumberSplit',
                             'NumberScreening', 'NumberWeeks', 'MeanEvaluation',
                             'NumberTraining', 'NumberTrainingWeek',
                             'TimeBetweenTraining', 'NumberWeeksNoTraining',
                             'Needs', 'Physics', 'NumberAts', 'NumberEx']
        else:
            numeric_feats = ['Gender', 'BirthYear', 'LoanPeriod', 'NumberAts']
        # Work on a copy so the caller's DataFrame is not mutated.
        X = X.copy()
        skewed_feats = X[numeric_feats].apply(lambda x: skew(x.dropna()))
        skewed_feats = skewed_feats[skewed_feats > self.SKEW_THRESHOLD].index
        for feat in skewed_feats:
            # Shift by +1 because Box-Cox requires strictly positive input
            # (assumes the raw features are non-negative — TODO confirm).
            X[feat] = X[feat] + 1
            X[feat], _ = boxcox(X[feat])
        return np.array(X)
class BoxCoxNormalizerNoGender(BaseEstimator, TransformerMixin):
    """Box-Cox-normalize skewed numeric features, excluding Gender.

    Identical to :class:`BoxCoxNormalizer` except that the binary
    ``Gender`` column is left untouched.

    Fix over the original: works on a copy of *X*, so the caller's
    DataFrame is no longer mutated in place.
    """

    # Skewness threshold above which a feature is transformed.
    SKEW_THRESHOLD = 0.25

    def fit_transform(self, X, case=None):
        """Return *X* as a numpy array with skewed features normalized.

        :param X: pandas DataFrame containing at least the case's
                  numeric feature columns.
        :param case: ``"Risk"`` selects the extended feature set; any
                     other value selects the basic one.
        :return: ``numpy.ndarray`` with the same shape as *X*.
        """
        if case == "Risk":
            numeric_feats = ['BirthYear', 'LoanPeriod', 'NumberSplit',
                             'NumberScreening', 'NumberWeeks', 'MeanEvaluation',
                             'NumberTraining', 'NumberTrainingWeek',
                             'TimeBetweenTraining', 'NumberWeeksNoTraining',
                             'Needs', 'Physics', 'NumberAts', 'NumberEx']
        else:
            numeric_feats = ['BirthYear', 'LoanPeriod', 'NumberAts']
        # Work on a copy so the caller's DataFrame is not mutated.
        X = X.copy()
        skewed_feats = X[numeric_feats].apply(lambda x: skew(x.dropna()))
        skewed_feats = skewed_feats[skewed_feats > self.SKEW_THRESHOLD].index
        for feat in skewed_feats:
            # Shift by +1 because Box-Cox requires strictly positive input
            # (assumes the raw features are non-negative — TODO confirm).
            X[feat] = X[feat] + 1
            X[feat], _ = boxcox(X[feat])
        return np.array(X)
def _load_case_data(case):
    """Load and return ``(X, y)`` for *case*.

    Maps the case name to its data loader class, config file and
    embedded dataset; unknown cases fall back to the Risk loader,
    matching the original if/elif/else chain.
    """
    loaders = {
        "Complete": (data_loader.CompleteDataLoader, "complete.yaml", "complete_emb.csv"),
        "Compliance": (data_loader.ComplianceDataLoader, "compliance.yaml", "compliance_emb.csv"),
        "Fall": (data_loader.FallDataLoader, "fall.yaml", "fall_emb.csv"),
        "Risk": (data_loader.RiskDataLoader, "risk.yaml", "risk_emb.csv"),
    }
    loader_cls, config_name, file_name = loaders.get(case, loaders["Risk"])
    settings = load_config(pt.CONFIGS_DIR, config_name)
    dl = loader_cls(pt.PROCESSED_DATA_DIR, file_name, settings).load_data()
    return dl.get_data()


def main():
    """Evaluate normalizer x scaler preprocessing combinations per case.

    For each case, every (normalizer, scaler) pair is applied to the
    non-embedding columns, six classifiers are cross-evaluated, and the
    mean/std of each metric is appended to a per-case CSV report.
    """
    cases = ["Complete", "Compliance", "Fall", "Risk"]
    metrics = ['accuracy', 'precision', 'recall', 'roc_auc',
               'average_precision', 'f1']
    normalizer_names = ["None", "BoxCox", "BoxCoxNoGender"]
    normalizers = [DummyNormalizer(), BoxCoxNormalizer(),
                   BoxCoxNormalizerNoGender()]
    scaler_names = ["None", "Standard", "MinMax", "MinMaxRange", "Robust",
                    "MaxAbs", "QuantileTransformer", "QuantileTransformerNorm"]
    # BUGFIX: the original listed 9 scalers for these 8 names, so zip()
    # silently dropped the normal-output QuantileTransformer and the
    # "QuantileTransformerNorm" rows were computed with the wrong scaler.
    # The lists are now aligned one-to-one.
    scalers = [DummyScaler(), StandardScaler(), MinMaxScaler(),
               MinMaxScaler((-1, 1)), RobustScaler(), MaxAbsScaler(),
               QuantileTransformer(random_state=0),
               QuantileTransformer(output_distribution='normal', random_state=0)]
    seed = 0  # shared CV seed so all combinations see identical splits
    for case in cases:
        output_filename = f"{case} model preprocessing.csv"
        header = ['clf', 'normalizer', 'scaler',
                  'accuracy_mean', 'accuracy_std',
                  'precision_mean', 'precision_std',
                  'recall_mean', 'recall_std',
                  'roc_auc_mean', 'roc_auc_std',
                  'pr_auc_mean', 'pr_auc_std',
                  'f1_mean', 'f1_std']
        with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'w',
                  encoding='UTF8', newline='') as f:
            csv.writer(f).writerow(header)
        # Load once per case (the original re-read the CSV from disk in
        # the innermost loop); each combination gets a fresh copy so no
        # normalizer/scaler can leak in-place changes into the next run.
        X_full, y_full = _load_case_data(case)
        for normalizer_name, normalizer in zip(normalizer_names, normalizers):
            for scaler_name, scaler in zip(scaler_names, scalers):
                X = X_full.copy()
                # Embedding columns (names like "12Ats..."/"3Ex...") are
                # excluded from normalization and scaling. The original
                # pattern '((\d+)[Ats|Ex])\w+' misused a character class
                # where an alternation was intended.
                emb_cols = X.filter(regex=r'(\d+)(Ats|Ex)\w+', axis=1)
                n_norm_cols = X.shape[1] - emb_cols.shape[1]
                X_sc = pd.DataFrame(
                    normalizer.fit_transform(X.iloc[:, :n_norm_cols], case))
                X = pd.concat([X_sc, X.iloc[:, n_norm_cols:]], axis=1)
                X = np.array(X)
                y = np.array(y_full)
                X_sc = scaler.fit_transform(X[:, :n_norm_cols])
                X = np.concatenate([X_sc, X[:, n_norm_cols:]], axis=1)
                results = {'KNN': KnnClassifier(X, y).evaluate(metrics, seed),
                           'SVM': SvmClassifier(X, y).evaluate(metrics, seed),
                           'LR': LrClassifier(X, y).evaluate(metrics, seed),
                           'XGB': XgbClassifier(X, y).evaluate(metrics, seed),
                           'RF': RfClassifier(X, y).evaluate(metrics, seed),
                           'MLP': MlpClassifier(X, y).evaluate(metrics, seed)}
                with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'a',
                          encoding='UTF8', newline='') as f:
                    writer = csv.writer(f)
                    for clf_name, result in results.items():
                        row = [clf_name, normalizer_name, scaler_name]
                        for metric in metrics:
                            row.extend((compute_mean(result[metric]),
                                        compute_std(result[metric])))
                        writer.writerow(row)


if __name__ == '__main__':
    main()