#!/usr/bin/env python
import numpy as np
import pandas as pd

import paths as pt
from tools import file_reader, file_writer, explainer
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Target outcomes to model; one dataset is loaded and one classifier trained per case.
CASES = ["Complete", "Compliance", "Fall", "Risk"]
def main(dataset_version: str = 'emb'):
    """Train and evaluate one XGBoost binary classifier per case in CASES.

    For each case the matching dataset is loaded (one-hot encoded counts or
    embedding features, selected by ``dataset_version``), a stratified 70/30
    train/test split is made, 5-fold stratified CV with early stopping is run
    on the training part, and the final model is persisted to MODELS_DIR.
    Test-set scores are printed to stdout.

    :param dataset_version: 'ohe' to load the *_count.csv one-hot datasets,
        anything else (default 'emb') to load the *_emb.csv embedding datasets.
    """
    # Data sources per case: (ohe directory, ohe file, emb directory, emb file).
    # Replaces a 40-line copy-pasted if/elif chain with a lookup table.
    data_sources = {
        "Complete":   (pt.PROCESSED_DATA_DIR, 'complete_count.csv',
                       pt.TESTS_FILES_DIR, 'complete_emb.csv'),
        "Compliance": (pt.PROCESSED_DATA_DIR, 'compliance_count.csv',
                       pt.TESTS_FILES_DIR, 'compliance_emb.csv'),
        "Fall":       (pt.PROCESSED_DATA_DIR, 'fall_count.csv',
                       pt.TESTS_FILES_DIR, 'fall_emb.csv'),
        # NOTE(review): unlike the other cases, the Risk embedding file is read
        # from PROCESSED_DATA_DIR rather than TESTS_FILES_DIR. Preserved as-is
        # from the original code — confirm this is intentional.
        "Risk":       (pt.PROCESSED_DATA_DIR, 'risk_count.csv',
                       pt.PROCESSED_DATA_DIR, 'risk_emb.csv'),
    }

    for case in CASES:
        ohe_dir, ohe_file, emb_dir, emb_file = data_sources[case]
        if dataset_version == 'ohe':
            df = file_reader.read_csv(ohe_dir, ohe_file)
        else:
            df = file_reader.read_csv(emb_dir, emb_file)
        # The target column carries the same name as the case itself.
        target_name = case

        # Shuffle deterministically so runs are reproducible.
        df = df.sample(frac=1, random_state=0).reset_index(drop=True)

        X = df.drop([target_name, 'Rand'], axis=1)
        y = df[target_name]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, stratify=y, random_state=0)

        # Compensate for class imbalance: weight positives by the neg/pos ratio.
        neg, pos = np.bincount(y)
        scale_pos_weight = neg / pos

        params = {"n_estimators": 400,
                  "objective": "binary:logistic",
                  "scale_pos_weight": scale_pos_weight,
                  "use_label_encoder": False,
                  "learning_rate": 0.07,
                  "eval_metric": "logloss",
                  "random_state": 0}
        model = xgb.XGBClassifier(**params)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        # BUG FIX: the accumulator used to be `0*y` — an *integer* Series over
        # the FULL dataset, indexed below with positions local to X_train.
        # That misaligned the stored predictions and risked truncating float
        # probabilities to 0 before the `> 0.5` threshold. Use a float Series
        # sized and positionally aligned to X_train instead.
        y_valid_pred = pd.Series(np.zeros(len(y_train)))
        valid_acc, valid_pre, valid_recall, valid_roc_auc = [], [], [], []

        for train_index, valid_index in skf.split(X_train, y_train):
            X_train_split = X_train.iloc[train_index, :]
            X_valid_split = X_train.iloc[valid_index, :]
            y_train_split = y_train.iloc[train_index]
            y_valid_split = y_train.iloc[valid_index]

            # Early stopping on the held-out fold, using the project's Gini
            # metric; kept as a toggle to mirror the original behavior.
            optimize_rounds = True
            early_stopping_rounds = 200
            if optimize_rounds:
                eval_set = [(X_valid_split, y_valid_split)]
                fit_model = model.fit(X_train_split, y_train_split,
                                      eval_set=eval_set,
                                      eval_metric=metrics.gini_xgb,
                                      early_stopping_rounds=early_stopping_rounds,
                                      verbose=False)
            else:
                fit_model = model.fit(X_train_split, y_train_split)

            pred = fit_model.predict_proba(X_valid_split)[:, 1]
            y_valid_pred.iloc[valid_index] = pred

            y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)
            valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
            valid_pre.append(precision_score(y_valid_split, y_valid_scores))
            valid_recall.append(recall_score(y_valid_split, y_valid_scores))
            valid_roc_auc.append(roc_auc_score(y_valid_split,
                                               y_valid_pred.iloc[valid_index]))

        # Evaluate on the untouched test split (model holds the last-fold fit).
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]
        file_writer.write_joblib(model, pt.MODELS_DIR,
                                 f'{case.lower()}_xgboost.joblib')

        print(f"Scores for {case} XGBoost model:")
        # BUG FIX: the per-fold CV metrics were computed but never reported;
        # surface their means alongside the test-set scores.
        print(f"CV accuracy: {np.around(np.mean(valid_acc), decimals=3)}, "
              f"precision: {np.around(np.mean(valid_pre), decimals=3)}, "
              f"recall: {np.around(np.mean(valid_recall), decimals=3)}, "
              f"ROC AUC: {np.around(np.mean(valid_roc_auc), decimals=3)}")
        print(f"Accuracy: {np.around(accuracy_score(y_test, y_pred), decimals=3)}")
        print(f"Precision: {np.around(precision_score(y_test, y_pred), decimals=3)}")
        print(f"Recall: {np.around(recall_score(y_test, y_pred), decimals=3)}")
        print(f"ROC AUC: {np.around(roc_auc_score(y_test, y_proba), decimals=3)}\n")
# Script entry point: train all case models with the default dataset version.
if __name__ == '__main__':
    main()