train_xgboost_model.py 3.11 KB
Newer Older
1
2
3
#!/usr/bin/env python
import numpy as np
import pandas as pd
4
import paths as pt
5
from tools import file_reader, file_writer, data_loader
6
from utility import metrics
7
from sklearn.metrics import accuracy_score, precision_score
8
from sklearn.metrics import recall_score, roc_auc_score
9
10
11
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb
12
13
from pathlib import Path
import yaml
14

15
# Use cases for which main() trains one XGBoost model each.
CASES = [
    "Complete",
    "Compliance",
    "Fall",
    "Risk",
]

# Suffix that selects which dataset CSV variant is loaded for every case.
DATASET_VERSION = "emb"
17

18
def _load_case_data(case):
    """Load the features and labels for a single use case.

    Reads the case's YAML settings from ``pt.CONFIGS_DIR``, builds the
    matching data loader for the ``DATASET_VERSION`` CSV file and returns
    whatever the loader's ``prepare_data()`` yields (unpacked by the caller
    as ``X, y``).

    Args:
        case: One of "Complete", "Compliance", "Fall" or "Risk".

    Returns:
        The result of ``prepare_data()`` on the loaded data loader.

    Raises:
        ValueError: If ``case`` has no registered data loader. Previously any
            unrecognised case silently fell through to the Risk loader.
    """
    # Built inside the function so the lookup of the loader classes happens
    # at call time, exactly as in the original if/elif chain.
    loaders = {
        "Complete": data_loader.CompleteDataLoader,
        "Compliance": data_loader.ComplianceDataLoader,
        "Fall": data_loader.FallDataLoader,
        "Risk": data_loader.RiskDataLoader,
    }
    try:
        loader_cls = loaders[case]
    except KeyError:
        raise ValueError(
            f"Unknown case {case!r}; expected one of {sorted(loaders)}"
        ) from None

    prefix = case.lower()
    # NOTE: the settings file name is pinned to the "emb" variant (as before),
    # independently of DATASET_VERSION.
    config_path = Path(pt.CONFIGS_DIR, f"{prefix}_emb.yaml")
    with open(config_path, 'r') as stream:
        settings = yaml.safe_load(stream)

    file_name = f'{prefix}_{DATASET_VERSION}.csv'
    dl = loader_cls(file_name, settings).load_data()
    return dl.prepare_data()


def main():
    """Train, save and evaluate one XGBoost classifier per entry in CASES.

    For every case the data is split 70/30 (stratified on the labels, fixed
    seed), an ``XGBClassifier`` is fitted on the training part, the fitted
    model is written to ``pt.MODELS_DIR`` as ``<case>_xgboost.joblib`` and
    accuracy, precision, recall and ROC AUC on the test part are printed.

    The labels must be binary 0/1 with both classes present: the class-weight
    computation unpacks ``np.bincount`` into exactly two counts.
    """
    for case in CASES:
        X, y = _load_case_data(case)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, stratify=y, random_state=0)

        # Balance the classes with the negative/positive ratio. It is derived
        # from the training labels only, so no information about the held-out
        # test set enters the model's hyper-parameters.
        neg, pos = np.bincount(y_train)
        scale_pos_weight = neg / pos

        params = {
            "n_estimators": 400,
            "objective": "binary:logistic",
            "scale_pos_weight": scale_pos_weight,
            "use_label_encoder": False,
            "learning_rate": 0.1,
            "eval_metric": "logloss",
            "random_state": 0,
        }

        model = xgb.XGBClassifier(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # Column 1 holds the probability of the positive class.
        y_proba = model.predict_proba(X_test)[:, 1]
        file_writer.write_joblib(model, pt.MODELS_DIR, f'{case.lower()}_xgboost.joblib')

        print(f"Scores for {case} XGBoost model:")
        print(f"Accuracy: {np.around(accuracy_score(y_test, y_pred), decimals=3)}")
        print(f"Precision: {np.around(precision_score(y_test, y_pred), decimals=3)}")
        print(f"Recall: {np.around(recall_score(y_test, y_pred), decimals=3)}")
        print(f"ROC AUC: {np.around(roc_auc_score(y_test, y_proba), decimals=3)}\n")
72
73
    
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()