#!/usr/bin/env python
import numpy as np
import config as cfg
import pandas as pd
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Directory holding the preprocessed/embedded CSV datasets read below.
DATA_DIR = cfg.PROCESSED_DATA_DIR
# Outcome cases to model; each has its own dataset, target column (named
# after the case) and model output directory.
CASES = ["Complete", "Success", "Fall"]

def main():
    """Train one XGBoost binary classifier per outcome case.

    For each case in ``CASES``: load the embedded dataset, run 5-fold
    stratified cross-validation with early stopping, print mean CV
    accuracy/precision/recall, evaluate on a held-out split, and persist
    the model, a confusion-matrix plot and SHAP/gain feature importances.
    """
    for case in CASES:
        # Select the dataset and model output directory for this case.
        if case == "Complete":
            df = file_reader.read_csv(DATA_DIR, 'complete_emb.csv')
            model_dir = cfg.COMPLETE_XGB_DIR
        elif case == "Success":
            df = file_reader.read_csv(DATA_DIR, 'success_emb.csv')
            model_dir = cfg.SUCCESS_XGB_DIR
        else:
            df = file_reader.read_csv(DATA_DIR, 'fall_emb.csv')
            model_dir = cfg.FALL_XGB_DIR

        # Deterministic shuffle so fold composition is order-independent.
        df = df.sample(frac=1, random_state=0).reset_index(drop=True)

        X = df.drop([case], axis=1)
        y = df[case]

        # Stratify the hold-out split so its class balance matches the data.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=0, stratify=y)

        # FIX: derive the class-imbalance weight from the training labels
        # only; the original used the full `y`, leaking test-set information.
        neg, pos = np.bincount(y_train)
        scale_pos_weight = neg / pos

        params = {"n_estimators": 400,
                  "objective": "binary:logistic",
                  "scale_pos_weight": scale_pos_weight,
                  "use_label_encoder": False,
                  "learning_rate": 0.1,
                  "eval_metric": "logloss",
                  "seed": 0}
        model = xgb.XGBClassifier(**params)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        valid_acc, valid_pre, valid_recall = list(), list(), list()
        for train_index, valid_index in skf.split(X_train, y_train):
            X_train_split = X_train.iloc[train_index, :]
            X_valid_split = X_train.iloc[valid_index, :]
            y_train_split = y_train.iloc[train_index]
            y_valid_split = y_train.iloc[valid_index]

            optimize_rounds = True
            early_stopping_rounds = 50
            if optimize_rounds:
                # Early-stop each fold on its own validation set, scored
                # with the project's Gini metric.
                eval_set = [(X_valid_split, y_valid_split)]
                fit_model = model.fit(X_train_split, y_train_split,
                                      eval_set=eval_set,
                                      eval_metric=metrics.gini_xgb,
                                      early_stopping_rounds=early_stopping_rounds,
                                      verbose=False)
            else:
                fit_model = model.fit(X_train_split, y_train_split)

            # FIX: the original wrote fold probabilities into a series built
            # from the FULL label vector (`0*y`) and indexed it with
            # positions relative to X_train, so predictions and labels were
            # misaligned. Score the fold's predictions directly.
            pred = fit_model.predict_proba(X_valid_split)[:, 1]
            y_valid_scores = pred > 0.5
            valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
            valid_pre.append(precision_score(y_valid_split, y_valid_scores))
            valid_recall.append(recall_score(y_valid_split, y_valid_scores))

        print(f"Accuracy: {np.around(np.mean(valid_acc), decimals=3)}")
        print(f"Precision: {np.around(np.mean(valid_pre), decimals=3)}")
        print(f"Recall: {np.around(np.mean(valid_recall), decimals=3)}\n")

        # NOTE(review): `model` here holds the fit from the LAST CV fold; it
        # is never refit on the full training set before evaluation and
        # persistence — confirm this is intended.
        y_pred = model.predict(X_test)
        file_writer.write_cm_plot(y_test, y_pred, cfg.REPORTS_PLOTS_DIR,
                                  f'{case.lower()}_xgb_cm.pdf', case)
        file_writer.write_joblib(model, model_dir, f'{case}_xgboost.joblib')

        # Mean |SHAP| per feature plus XGBoost's own gain importances,
        # written to the reports directory, then summary plots.
        feature_names = X.columns
        shap_explainer, shap_values = explainer.get_shap_tree_explainer(model, X_test=X_test)
        importance_df = pd.DataFrame()
        importance_df['feature'] = feature_names
        importance_df['shap_values'] = np.around(
            abs(np.array(shap_values)[:, :]).mean(0), decimals=3)
        importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=3)
        feat_importance_df_shap = (importance_df.groupby('feature')
                                   .mean()
                                   .sort_values('shap_values', ascending=False))
        feat_importance_df_shap = feat_importance_df_shap.reset_index()
        file_writer.write_csv(feat_importance_df_shap, cfg.REPORTS_DIR,
                              f"{case} best features.csv")
        file_name_sum = f'{case.lower()}_shap_summary'
        file_name_exp = f'{case.lower()}_shap_row_0'
        explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
                                         file_name_sum, model_dir,
                                         plot_type=None)
        explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
                                         file_name_sum + '_bar', model_dir,
                                         plot_type="bar")
    
# Script entry point: train, evaluate and persist a model for every case.
if __name__ == '__main__':
    main()