"""
evaluate_preprocessing.py
====================================
Script to evaluate preprocessing strategies for cases.
"""
import numpy as np
import pandas as pd
from tools import data_loader
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler, MaxAbsScaler, QuantileTransformer
from tools.classifiers import KnnClassifier, SvmClassifier, LrClassifier
from tools.classifiers import XgbClassifier, RfClassifier, MlpClassifier
from utility.config import load_config
from utility.metrics import compute_mean, compute_std
import csv
import paths as pt
[docs]class DummyScaler(BaseEstimator, TransformerMixin):
[docs]class DummyNormalizer(BaseEstimator, TransformerMixin):
[docs]class BoxCoxNormalizer(BaseEstimator, TransformerMixin):
[docs]class BoxCoxNormalizerNoGender(BaseEstimator, TransformerMixin):
def _load_case_data(case):
    """Load the feature matrix X and labels y for a given case name.

    Each case has a matching YAML config and an ``*_emb.csv`` embeddings
    file; any unrecognized case falls back to "Risk", mirroring the
    original if/elif/else chain.
    """
    loaders = {
        "Complete": ("complete", data_loader.CompleteDataLoader),
        "Compliance": ("compliance", data_loader.ComplianceDataLoader),
        "Fall": ("fall", data_loader.FallDataLoader),
        "Risk": ("risk", data_loader.RiskDataLoader),
    }
    prefix, loader_cls = loaders.get(case, loaders["Risk"])
    settings = load_config(pt.CONFIGS_DIR, f"{prefix}.yaml")
    dl = loader_cls(pt.PROCESSED_DATA_DIR, f"{prefix}_emb.csv",
                    settings).load_data()
    return dl.get_data()

def main():
    """Evaluate preprocessing (normalizer x scaler) combinations per case.

    For every case, each normalizer/scaler pair is fit on the
    non-embedding feature columns, six classifiers are evaluated with a
    fixed seed, and the mean/std of each metric is appended to a
    per-case CSV report in ``pt.REPORTS_DIR``.
    """
    cases = ["Complete", "Compliance", "Fall", "Risk"]
    metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'f1']
    normalizer_names = ["None", "BoxCox", "BoxCoxNoGender"]
    normalizers = [DummyNormalizer(), BoxCoxNormalizer(), BoxCoxNormalizerNoGender()]
    scaler_names = ["None", "Standard", "MinMax", "MinMaxRange", "Robust",
                    "MaxAbs", "QuantileTransformer", "QuantileTransformerNorm"]
    # BUGFIX: the original list had 9 scalers for 8 names — zip() truncated
    # it, so "QuantileTransformer" ran an unseeded instance and
    # "QuantileTransformerNorm" ran the seeded *uniform* one, while the
    # output_distribution='normal' variant was silently dropped. The lists
    # are now aligned 1:1, keeping the seeded (reproducible) instances.
    scalers = [DummyScaler(), StandardScaler(), MinMaxScaler(), MinMaxScaler((-1, 1)),
               RobustScaler(), MaxAbsScaler(), QuantileTransformer(random_state=0),
               QuantileTransformer(output_distribution='normal', random_state=0)]
    for case in cases:
        output_filename = f"{case} model preprocessing.csv"
        header = ['clf', 'normalizer', 'scaler', 'accuracy_mean', 'accuracy_std',
                  'precision_mean', 'precision_std', 'recall_mean', 'recall_std',
                  'roc_auc_mean', 'roc_auc_std', 'pr_auc_mean', 'pr_auc_std',
                  'f1_mean', 'f1_std']
        # Truncate/create the report and write the header once per case;
        # each pair below re-opens in append mode to add its rows.
        with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'w',
                  encoding='UTF8', newline='') as f:
            csv.writer(f).writerow(header)
        for normalizer_name, normalizer in zip(normalizer_names, normalizers):
            for scaler_name, scaler in zip(scaler_names, scalers):
                # Reload fresh data per pair — X is transformed in place below,
                # so transforms must not accumulate across iterations.
                X, y = _load_case_data(case)
                # Split off embedding columns so only the leading (raw)
                # feature columns are normalized and scaled.
                # NOTE(review): '[Ats|Ex]' is a character *class* (matches any
                # one of A,t,s,|,E,x), not the alternation '(Ats|Ex)' the name
                # suggests — kept byte-identical to preserve column selection;
                # confirm intent before changing. String made raw to silence
                # the invalid '\d'/'\w' escape warnings.
                emb_cols = X.filter(regex=r'((\d+)[Ats|Ex])\w+', axis=1)
                n_norm_cols = X.shape[1] - emb_cols.shape[1]
                # Normalizer gets the case name (custom fit_transform signature).
                X_sc = pd.DataFrame(normalizer.fit_transform(X.iloc[:, :n_norm_cols], case))
                X = pd.concat([X_sc, X.iloc[:, n_norm_cols:]], axis=1)
                X = np.array(X)
                y = np.array(y)
                X_sc = scaler.fit_transform(X[:, :n_norm_cols])
                X = np.concatenate([X_sc, X[:, n_norm_cols:]], axis=1)
                seed = 0
                results = {'KNN': KnnClassifier(X, y).evaluate(metrics, seed),
                           'SVM': SvmClassifier(X, y).evaluate(metrics, seed),
                           'LR': LrClassifier(X, y).evaluate(metrics, seed),
                           'XGB': XgbClassifier(X, y).evaluate(metrics, seed),
                           'RF': RfClassifier(X, y).evaluate(metrics, seed),
                           'MLP': MlpClassifier(X, y).evaluate(metrics, seed)}
                # Append one row per classifier: mean/std for every metric.
                with open(Path.joinpath(pt.REPORTS_DIR, output_filename), 'a',
                          encoding='UTF8', newline='') as f:
                    writer = csv.writer(f)
                    for clf_name, result in results.items():
                        row = [clf_name, normalizer_name, scaler_name]
                        for metric in metrics:
                            row.extend((compute_mean(result[metric]),
                                        compute_std(result[metric])))
                        writer.writerow(row)

if __name__ == '__main__':
    main()