#!/usr/bin/env python
import numpy as np
import paths as pt

from tools import file_writer, data_loader
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb
from sklearn.metrics import accuracy_score
import pandas as pd
from utility.metrics import gini_xgb
import shap
from typing import List
from utility.settings import load_settings

NUM_ITERATIONS = 1

def main():
    """Train a per-case XGBoost model and export SHAP feature rankings.

    For each configured case, loads the matching dataset, runs
    NUM_ITERATIONS rounds of cross-validated SHAP scoring via
    get_best_shap_features, then writes a feature-importance plot and a
    CSV report to the reports directories.
    """
    #cases = ["Complete", "Compliance", "Alarm", "Fall"]
    cases = ['Alarm']

    # Per-case configuration: (settings yaml, data csv, loader class).
    # NOTE(review): "Alarm" pairs alarm_emb.yaml with alarm_count.csv in the
    # original code — kept as-is; confirm this pairing is intentional.
    case_config = {
        "Complete": ("complete_emb.yaml", "complete_emb.csv",
                     data_loader.CompleteDataLoader),
        "Compliance": ("compliance_emb.yaml", "compliance_emb.csv",
                       data_loader.ComplianceDataLoader),
        "Alarm": ("alarm_emb.yaml", "alarm_count.csv",
                  data_loader.AlarmDataLoader),
        "Fall": ("fall_emb.yaml", "fall_emb.csv",
                 data_loader.FallDataLoader),
    }

    for case in cases:
        # Unknown case names fall back to "Fall", matching the original
        # if/elif/else chain's final else branch.
        settings_file, data_file, loader_cls = case_config.get(
            case, case_config["Fall"])
        settings = load_settings(settings_file)
        dl = loader_cls(data_file, settings).load_data()

        features = dl.get_features()
        # The original also called dl.get_data() in every branch, but those
        # values were immediately overwritten by prepare_data() — the dead
        # calls are removed here.
        X, y = dl.prepare_data()
        X = np.array(X)
        y = np.array(y)

        total_shap_df = pd.DataFrame()
        for seed in range(NUM_ITERATIONS):
            print('#'*40, '{} of {} iterations'.format(seed+1, NUM_ITERATIONS), '#' * 40)
            temp_shap_df = get_best_shap_features(X, y, features, seed)
            total_shap_df = pd.concat([total_shap_df, temp_shap_df])

        # Average SHAP values across iterations and rank features by them.
        shap_sorted_df = total_shap_df.groupby('feature').mean().sort_values('shap_values',
                                                                             ascending=False)
        shap_sorted_df = shap_sorted_df.reset_index()

        importances = shap_sorted_df['shap_values']
        features = shap_sorted_df['feature']

        plot_file_name = f"{case} SHAP feature values"
        csv_file_name = f"{case} model features.csv"
        file_writer.write_shap_importance_plot(features, importances, pt.REPORTS_PLOTS_DIR, plot_file_name)
        file_writer.write_csv(shap_sorted_df, pt.REPORTS_DIR, csv_file_name)
def get_best_shap_features(X: np.ndarray, y: np.ndarray,
                           cols: List[str], seed: int) -> pd.DataFrame:
    """Rank features by mean absolute SHAP value over stratified K-fold CV.

    Trains an XGBoost binary classifier on each of K=5 stratified folds,
    computes SHAP values on each fold's training split, and averages the
    per-feature scores across folds.

    :param X: feature matrix, shape (n_samples, n_features)
    :param y: binary labels (0/1); used for stratification and class weighting
    :param cols: feature names, one per column of X
    :param seed: random seed for fold shuffling and the model
    :return: DataFrame with columns 'feature', 'shap_values', 'feat_imp',
        sorted by 'shap_values' descending
    """
    K = 5
    kf = StratifiedKFold(n_splits=K, random_state=seed, shuffle=True)

    feat_importance_df = pd.DataFrame()

    # Up-weight the positive class in proportion to the class imbalance.
    neg, pos = np.bincount(y)
    scale_pos_weight = neg / pos

    model = xgb.XGBClassifier(n_estimators=400,
                              learning_rate=0.1,
                              objective='binary:logistic',
                              scale_pos_weight=scale_pos_weight,
                              eval_metric='logloss',
                              use_label_encoder=False,
                              random_state=seed)

    acc_score_list = list()
    for i, (train_index, test_index) in enumerate(kf.split(X, y)):
        print('='*30, '{} of {} folds'.format(i+1, K), '='*30)
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        # The gini_xgb metric passed here overrides the constructor's
        # 'logloss' during fitting.
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  eval_metric=gini_xgb,
                  verbose=0)

        # Validation accuracy at a 0.5 probability threshold.
        y_val_pred = model.predict_proba(X_val)[:, 1]
        y_scores_new = (y_val_pred > 0.5)
        acc_score_list.append(np.around(accuracy_score(y_val, y_scores_new), decimals=3))

        # Mean |SHAP| per feature on this fold's training split. The original
        # used abs(np.array(shap_values)[:, :]); the [:, :] slice was a no-op,
        # and np.abs performs the array conversion directly.
        shap_values = shap.TreeExplainer(model).shap_values(X_train)
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = cols
        fold_importance_df['shap_values'] = np.around(np.abs(shap_values).mean(axis=0), decimals=2)
        fold_importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=2)

        feat_importance_df = pd.concat([feat_importance_df, fold_importance_df])

    mean_acc_score = np.mean(acc_score_list)
    print(f"Mean accuracy: {mean_acc_score}")

    # Average the per-fold scores and sort features by mean SHAP value.
    feat_importance_df_shap = feat_importance_df.groupby('feature').mean().sort_values('shap_values',
                                                                                       ascending=False)
    feat_importance_df_shap = feat_importance_df_shap.reset_index()
    return feat_importance_df_shap

# Script entry point: run the SHAP feature-ranking pipeline.
if __name__ == '__main__':
    main()