#!/usr/bin/env python
import numpy as np
import config as cfg
import pandas as pd
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Directory with the preprocessed datasets and the four screening cases
# that get their own XGBoost model.
DATA_DIR = cfg.PROCESSED_DATA_DIR
CASES = ["Complete", "Compliance", "Fall", "Fall_test"]
def main(dataset_version: str = 'emb'):
    """Train and evaluate one XGBoost classifier per screening case.

    For every case in ``CASES`` this loads the matching dataset
    (count/one-hot features when ``dataset_version == 'ohe'``, embedding
    features otherwise), trains with 5-fold stratified CV and early
    stopping, evaluates the last fold's model on a held-out test split,
    saves the model plus a confusion-matrix plot, and writes SHAP-based
    feature importances.

    :param dataset_version: 'ohe' selects the ``*_count.csv`` datasets;
        any other value selects the ``*_emb.csv`` datasets (default).
    """
    # Per-case configuration: (csv file stem, model directory, target column).
    # Replaces the former duplicated if/elif chain.
    case_settings = {
        "Complete": ("complete", cfg.COMPLETE_XGB_DIR, "Complete"),
        "Compliance": ("compliance", cfg.COMPLIANCE_XGB_DIR, "Compliance"),
        "Fall": ("fall", cfg.FALL_XGB_DIR, "Fall"),
        "Fall_test": ("fall_test", cfg.FALL_TEST_XGB_DIR, "Fall"),
    }
    for case in CASES:
        file_stem, model_dir, target_name = case_settings[case]
        suffix = 'count' if dataset_version == 'ohe' else 'emb'
        df = file_reader.read_csv(DATA_DIR, f'{file_stem}_{suffix}.csv')

        # Deterministic shuffle so the CV folds are reproducible.
        df = df.sample(frac=1, random_state=0).reset_index(drop=True)

        X = df.drop([target_name], axis=1)
        y = df[target_name]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                            stratify=y, random_state=0)

        # Reweight the positive class for imbalance (binary target assumed).
        neg, pos = np.bincount(y)
        scale_pos_weight = neg / pos

        params = {"n_estimators": 400,
                  "objective": "binary:logistic",
                  "scale_pos_weight": scale_pos_weight,
                  "use_label_encoder": False,
                  "learning_rate": 0.07,
                  "eval_metric": "logloss",
                  "seed": 0}
        model = xgb.XGBClassifier(**params)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        # Out-of-fold predicted probabilities. Start from a FLOAT series:
        # the old `0*y` kept the target's integer dtype, and assigning float
        # probabilities into an int series via .iloc relies on implicit
        # upcasting that modern pandas deprecates/removes.
        y_valid_pred = 0.0 * y
        valid_acc, valid_pre, valid_recall, valid_roc_auc = [], [], [], []
        for train_index, valid_index in skf.split(X_train, y_train):
            X_train_split = X_train.iloc[train_index, :]
            X_valid_split = X_train.iloc[valid_index, :]
            y_train_split = y_train.iloc[train_index]
            y_valid_split = y_train.iloc[valid_index]

            optimize_rounds = True
            early_stopping_rounds = 50
            if optimize_rounds:
                # Let early stopping pick the boosting-round count on the
                # validation fold, scored with the project's Gini metric.
                eval_set = [(X_valid_split, y_valid_split)]
                fit_model = model.fit(X_train_split, y_train_split,
                                      eval_set=eval_set,
                                      eval_metric=metrics.gini_xgb,
                                      early_stopping_rounds=early_stopping_rounds,
                                      verbose=False)
            else:
                fit_model = model.fit(X_train_split, y_train_split)

            pred = fit_model.predict_proba(X_valid_split)[:, 1]
            y_valid_pred.iloc[valid_index] = pred

            # Fold-level metrics at the 0.5 decision threshold.
            y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)
            valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
            valid_pre.append(precision_score(y_valid_split, y_valid_scores))
            valid_recall.append(recall_score(y_valid_split, y_valid_scores))
            valid_roc_auc.append(roc_auc_score(y_valid_split,
                                               y_valid_pred.iloc[valid_index]))

        # NOTE(review): the model evaluated/saved below is the one fitted on
        # the LAST CV fold only — confirm this is intended rather than a
        # refit on the full training split.
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        file_writer.write_cm_plot(y_test, y_pred, cfg.REPORTS_PLOTS_DIR,
                                  f'{case.lower()}_xgb_cm.pdf', case)
        file_writer.write_joblib(model, model_dir, f'{case.lower()}_xgboost.joblib')

        print(f"Scores for {case} XGBoost model:")
        print(f"Accuracy: {np.around(accuracy_score(y_test, y_pred), decimals=3)}")
        print(f"Precision: {np.around(precision_score(y_test, y_pred), decimals=3)}")
        print(f"Recall: {np.around(recall_score(y_test, y_pred), decimals=3)}")
        print(f"ROC AUC: {np.around(roc_auc_score(y_test, y_proba), decimals=3)}\n")

        # SHAP feature importances: mean |SHAP| per feature alongside the
        # model's own gain-based importances, saved as CSV and plots.
        feature_names = X.columns
        shap_explainer, shap_values = explainer.get_shap_tree_explainer(model, X_test=X_test)

        importance_df = pd.DataFrame()
        importance_df['feature'] = feature_names
        importance_df['shap_values'] = np.around(
            abs(np.array(shap_values)[:, :]).mean(0), decimals=3)
        importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=3)
        feat_importance_df_shap = (importance_df.groupby('feature').mean()
                                   .sort_values('shap_values', ascending=False)
                                   .reset_index())
        file_writer.write_csv(feat_importance_df_shap, cfg.REPORTS_DIR,
                              f"{case} best features.csv")

        file_name_sum = f'{case.lower()}_shap_summary'
        explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
                                         file_name_sum, model_dir,
                                         plot_type=None)
        explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
                                         file_name_sum + '_bar', model_dir,
                                         plot_type="bar")
# Script entry point: trains all cases with the default 'emb' datasets.
if __name__ == '__main__':
    main()