#!/usr/bin/env python
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

import config as cfg
from tools import file_reader, file_writer, explainer
from utility import metrics

13
# Directory holding the preprocessed input CSV files.
DATA_DIR = cfg.PROCESSED_DATA_DIR

# One XGBoost model is trained per case; inside main() the case name
# selects the input CSV, the model output directory and the target column.
CASES = ["Complete", "Compliance", "Fall", "Fall_test"]

def main():
    """Train, evaluate and explain one XGBoost classifier per case.

    For every case in CASES this function:
      1. loads the corresponding preprocessed CSV from DATA_DIR,
      2. runs 5-fold stratified cross-validation on a 90% training split
         and prints mean accuracy/precision/recall/ROC-AUC,
      3. evaluates the model from the final CV fold on the held-out 10%,
      4. writes a confusion-matrix plot, the serialized model, a SHAP
         feature-importance CSV and SHAP summary plots.
    """
    for case in CASES:
        # Map the case name to its input file, output directory and target column.
        if case == "Complete":
            df = file_reader.read_csv(DATA_DIR, 'complete_count.csv')
            model_dir = cfg.COMPLETE_XGB_DIR
            target_name = "Complete"
        elif case == "Compliance":
            df = file_reader.read_csv(DATA_DIR, 'compliance_count.csv')
            model_dir = cfg.COMPLIANCE_XGB_DIR
            target_name = "Compliance"
        elif case == "Fall":
            df = file_reader.read_csv(DATA_DIR, 'fall_count.csv')
            model_dir = cfg.FALL_XGB_DIR
            target_name = "Fall"
        else:  # "Fall_test"
            df = file_reader.read_csv(DATA_DIR, 'fall_test_count.csv')
            model_dir = cfg.FALL_TEST_XGB_DIR
            target_name = "Fall"

        # Deterministic shuffle so the split below is reproducible.
        df = df.sample(frac=1, random_state=0).reset_index(drop=True)

        X = df.drop([target_name], axis=1)
        y = df[target_name]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=0)

        # Class-imbalance correction computed on the training labels only;
        # using the full label vector would leak test-set information.
        neg, pos = np.bincount(y_train)
        scale_pos_weight = neg / pos

        params = {"n_estimators": 800,
                  "objective": "binary:logistic",
                  "scale_pos_weight": scale_pos_weight,
                  "use_label_encoder": False,
                  "learning_rate": 0.1,
                  "eval_metric": "logloss",
                  "seed": 0}
        model = xgb.XGBClassifier(**params)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        # Out-of-fold probability buffer aligned with y_train (skf yields
        # positional indices into X_train/y_train, not into the full df).
        # Float dtype so probability assignments are not truncated.
        y_valid_pred = pd.Series(0.0, index=y_train.index)
        valid_acc, valid_pre, valid_recall, valid_roc_auc = [], [], [], []
        for train_index, valid_index in skf.split(X_train, y_train):
            X_train_split = X_train.iloc[train_index, :]
            X_valid_split = X_train.iloc[valid_index, :]
            y_train_split = y_train.iloc[train_index]
            y_valid_split = y_train.iloc[valid_index]

            optimize_rounds = True
            early_stopping_rounds = 50
            if optimize_rounds:
                # Early stopping on the validation fold using the project's
                # Gini metric; `model` retains the fit of the last fold.
                eval_set = [(X_valid_split, y_valid_split)]
                fit_model = model.fit(X_train_split, y_train_split,
                                      eval_set=eval_set,
                                      eval_metric=metrics.gini_xgb,
                                      early_stopping_rounds=early_stopping_rounds,
                                      verbose=False)
            else:
                fit_model = model.fit(X_train_split, y_train_split)

            pred = fit_model.predict_proba(X_valid_split)[:, 1]
            y_valid_pred.iloc[valid_index] = pred

            # Hard labels at the conventional 0.5 threshold for the
            # threshold-dependent metrics; ROC-AUC uses raw probabilities.
            y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)
            valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
            valid_pre.append(precision_score(y_valid_split, y_valid_scores))
            valid_recall.append(recall_score(y_valid_split, y_valid_scores))
            valid_roc_auc.append(roc_auc_score(y_valid_split,
                                               y_valid_pred.iloc[valid_index]))

        print(f"Accuracy: {np.around(np.mean(valid_acc), decimals=3)}")
        print(f"Precision: {np.around(np.mean(valid_pre), decimals=3)}")
        print(f"Recall: {np.around(np.mean(valid_recall), decimals=3)}")
        print(f"ROC AUC: {np.around(np.mean(valid_roc_auc), decimals=3)}\n")

        # Held-out evaluation uses the model as fitted on the final CV fold.
        y_pred = model.predict(X_test)
        file_writer.write_cm_plot(y_test, y_pred, cfg.REPORTS_PLOTS_DIR,
                                  f'{case.lower()}_xgb_cm.pdf', case)
        file_writer.write_joblib(model, model_dir, f'{case.lower()}_xgboost.joblib')

        # SHAP-based feature importance: mean absolute SHAP value per feature,
        # alongside the model's built-in gain importances.
        feature_names = X.columns
        shap_explainer, shap_values = explainer.get_shap_tree_explainer(model, X_test=X_test)
        importance_df = pd.DataFrame()
        importance_df['feature'] = feature_names
        importance_df['shap_values'] = np.around(
            abs(np.array(shap_values)[:, :]).mean(0), decimals=3)
        importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=3)
        feat_importance_df_shap = (importance_df.groupby('feature').mean()
                                   .sort_values('shap_values', ascending=False)
                                   .reset_index())
        file_writer.write_csv(feat_importance_df_shap, cfg.REPORTS_DIR,
                              f"{case} best features.csv")
        file_name_sum = f'{case.lower()}_shap_summary'
        explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
                                         file_name_sum, model_dir,
                                         plot_type=None)
        explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
                                         file_name_sum + '_bar', model_dir,
                                         plot_type="bar")
# Script entry point: train/evaluate all cases when run directly.
if __name__ == '__main__':
    main()