#!/usr/bin/env python
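"""Train an XGBoost classifier for each case in CASES, evaluate it with
stratified 5-fold cross-validation and a held-out test set, and export
the fitted model, a confusion-matrix plot and SHAP feature importances."""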
import numpy as np
import pandas as pd
import paths as pt
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb

CASES = ["Complete", "Compliance", "Fall", "Risk"]

def main(dataset_version: str = 'emb'):
    for case in CASES:
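        # Load the processed dataset for this case: '<case>_count.csv' for
        # the one-hot encoded version ('ohe'), '<case>_emb.csv' otherwise.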
        if case == "Complete":
            if dataset_version == 'ohe':
                df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_count.csv')
            else:
                df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
            model_dir = pt.COMPLETE_XGB_DIR
            target_name = "Complete"
        elif case == "Compliance":
            if dataset_version == 'ohe':
                df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_count.csv')
            else:
                df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_emb.csv')
            model_dir = pt.COMPLIANCE_XGB_DIR
            target_name = "Compliance"
        elif case == "Fall":
            if dataset_version == 'ohe':
                df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_count.csv')
            else:
                df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_emb.csv')
            model_dir = pt.FALL_XGB_DIR
            target_name = "Fall"
        else:
            if dataset_version == 'ohe':
                df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'risk_count.csv')
            else:
                df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'risk_emb.csv')
            model_dir = pt.RISK_XGB_DIR
            target_name = "Risk"
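
        # Shuffle the rows deterministically so results are reproducible.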
        df = df.sample(frac=1, random_state=0).reset_index(drop=True)

        X = df.drop([target_name], axis=1)
        y = df[target_name]
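        # Hold out 30% of the data as a stratified test set.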
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                            stratify=y, random_state=0)
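
        # Weight positives by the negative/positive ratio to offset class
        # imbalance (computed over the full label column).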
        neg, pos = np.bincount(y)
        scale_pos_weight = neg / pos
    
        params = {"n_estimators": 400,
                  "objective": "binary:logistic",
                  "scale_pos_weight": scale_pos_weight,
                  "use_label_encoder": False,
                  "learning_rate": 0.07,
                  "eval_metric": "logloss",
                  "seed": 0}

        model = xgb.XGBClassifier(**params)
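
        # Stratified 5-fold CV on the training set: collect out-of-fold
        # predictions and per-fold validation scores.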
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    
        y_valid_pred = 0.0 * y_train  # align with y_train; float dtype for probabilities
        valid_acc, valid_pre, valid_recall, valid_roc_auc = list(), list(), list(), list()
        for train_index, valid_index in skf.split(X_train, y_train):
            X_train_split, X_valid_split = X_train.iloc[train_index,:], X_train.iloc[valid_index,:]
            y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]

            optimize_rounds = True
            early_stopping_rounds = 200
            if optimize_rounds:
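                # Use the fold's validation split for early stopping on the
                # project's Gini metric.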
                eval_set = [(X_valid_split, y_valid_split)]
                fit_model = model.fit(X_train_split, y_train_split,
                                      eval_set=eval_set,
                                      eval_metric=metrics.gini_xgb,
                                      early_stopping_rounds=early_stopping_rounds,
                                      verbose=False)
            else:
                fit_model = model.fit(X_train_split, y_train_split)
        
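            # Record this fold's out-of-fold probabilities; score hard labels
            # at a 0.5 threshold and ROC AUC on the raw probabilities.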
            pred = fit_model.predict_proba(X_valid_split)[:,1]
            y_valid_pred.iloc[valid_index] = pred
            
            y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)
            valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
            valid_pre.append(precision_score(y_valid_split, y_valid_scores))
            valid_recall.append(recall_score(y_valid_split, y_valid_scores))
            valid_roc_auc.append(roc_auc_score(y_valid_split, y_valid_pred.iloc[valid_index]))
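
        # Evaluate on the held-out test set; `model` retains the fit from
        # the final CV fold.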
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:,1]
        file_writer.write_cm_plot(y_test, y_pred, pt.REPORTS_PLOTS_DIR,
                                  f'{case.lower()}_xgb_cm.pdf', case)
        file_writer.write_joblib(model, model_dir, f'{case.lower()}_xgboost.joblib')
        
        print(f"Scores for {case} XGBoost model:")
        print(f"Accuracy: {np.around(accuracy_score(y_test, y_pred), decimals=3)}")
        print(f"Precision: {np.around(precision_score(y_test, y_pred), decimals=3)}")
        print(f"Recall: {np.around(recall_score(y_test, y_pred), decimals=3)}")
        print(f"ROC AUC: {np.around(roc_auc_score(y_test, y_proba), decimals=3)}\n")
        
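        # Rank features by mean absolute SHAP value alongside XGBoost's
        # built-in importances, and save the ranking to a report.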
        feature_names = X.columns
        shap_explainer, shap_values = explainer.get_shap_tree_explainer(model, X_test=X_test)

        importance_df = pd.DataFrame()
        importance_df['feature'] = feature_names
        importance_df['shap_values'] = np.around(np.abs(np.array(shap_values)).mean(0), decimals=3)
        importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=3)
        feat_importance_df_shap = importance_df.groupby('feature').mean().sort_values('shap_values',
                                                                                       ascending=False)
        feat_importance_df_shap = feat_importance_df_shap.reset_index()
        file_writer.write_csv(feat_importance_df_shap, pt.REPORTS_DIR, f"{case} best features.csv")
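
        # Write SHAP summary plots: one with the default layout and one as
        # a bar plot.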
        file_name_sum = f'{case.lower()}_shap_summary'
        file_name_exp = f'{case.lower()}_shap_row_0'
        explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
                                         file_name_sum, model_dir,
                                         plot_type=None)
        explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
                                         file_name_sum + '_bar', model_dir,
                                         plot_type="bar")
    
if __name__ == '__main__':
    main()