"""
tune_complete_xgb_wb.py
====================================
Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
4
XGBoost tune script for Complete case on WanDB.
thecml's avatar
thecml committed
5
6
"""

from utility.config import load_config
from xgboost import XGBClassifier
from tools import data_loader
import paths as pt
from sklearn.model_selection import cross_validate, StratifiedKFold
import numpy as np
import os

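# Silence wandb console output; the env var must be set before wandb is imported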
os.environ["WANDB_SILENT"] = "true"
import wandb

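# Random-search sweep over XGBoost hyperparameters: wandb samples one
# combination per run and maximizes the mean CV accuracy logged below.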
sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "accuracy",
      "goal": "maximize"
    },
    "parameters": {
        "n_estimators": {
            "values": [50, 100, 200, 400]
        },
        "booster": {
            "values": ["gbtree"]
        },
        "max_depth": {
            "values": [3, 4, 5, 6, 7, 8, 9, 10]
        },
        "gamma": {
            "values": [0, 1, 5]
        },
        "colsample_bytree": {
            "values": [0.3, 0.5, 0.8, 1]
        },
        "min_child_weight": {
            "values": [int(x) for x in np.linspace(1, 10, 10, endpoint=True)]
        },
        "reg_alpha": {
            "values": [int(x) for x in np.linspace(10, 180, 20, endpoint=True)]
        },
        "reg_lambda": {
            "values": [float(x) for x in np.linspace(0, 0.9, 10, endpoint=True)]
        },
        "learning_rate": {
            "values": [0.01, 0.1, 0.05, 0.2]
        },
        "subsample": {
            "values": [0.8, 1]
        },
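        # boolean switch: when True, train_model() derives scale_pos_weight
        # from the class ratio; when False, XGBoost's default of 1 is used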
        "scale_pos_weight": {
            "values": [False, True]
        },
    }
}

def main():
    sweep_id = wandb.sweep(sweep_config,
                           project="air-complete-xgb")
    wandb.agent(sweep_id, train_model, count=100)

def train_model():
    config_defaults = {
        "n_estimators": 200,
        "booster": "gbtree",
        "max_depth": 3,
        "gamma": 0,
        "colsample_bytree": 1,
        "min_child_weight": 1,
        "reg_alpha": 0,
        "reg_lambda": 1,
        "learning_rate": 0.3,
        "subsample": 1,
        "scale_pos_weight": False,
        "seed": 0,
        "test_size": 0.3,
    }

    # Initialize a new wandb run
    wandb.init(config=config_defaults)

    # config holds the hyperparameters sampled by the sweep for this run
    config = wandb.config

    # Load data
    settings = load_config(pt.CONFIGS_DIR, "complete.yaml")
    dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
                                        "complete_emb.csv",
                                        settings).load_data()
    X, y = dl.get_data()

    # Make model
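    # Weight the positive class by the negative/positive count ratio, the
    # usual XGBoost recipe for class imbalance (assumes binary 0/1 labels)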
    if config.scale_pos_weight:
        neg, pos = np.bincount(y)
        scale_pos_weight = neg / pos
    else:
        scale_pos_weight = None
    params = {"n_estimators": config.n_estimators,
              "booster": config.booster,
              "max_depth":config.max_depth,
              "gamma":config.gamma,
              "colsample_bytree":config.colsample_bytree,
              "min_child_weight":config.min_child_weight,
              "reg_alpha":config.reg_alpha,
              "reg_lambda":config.reg_lambda,
              "learning_rate":config.learning_rate,
              "subsample":config.subsample,
              "scale_pos_weight":scale_pos_weight,
              "use_label_encoder":False,
              "eval_metric":"logloss",
              "objective":"binary:logistic",
              "random_state":0}
    model = XGBClassifier(**params)

    # 5-fold stratified CV with shuffling, preserving class balance per fold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    metrics = ['accuracy', 'precision', 'recall',
               'roc_auc', 'average_precision', 'f1']
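    # cross_validate reports each scorer under the key 'test_<metric>'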
    res_validate = cross_validate(model, X, y, cv=skf, scoring=metrics)

    # Evaluate performance
    accuracy = round(np.mean(res_validate['test_accuracy']), 3)
    precision = round(np.mean(res_validate['test_precision']), 3)
    recall = round(np.mean(res_validate['test_recall']), 3)
    roc_auc = round(np.mean(res_validate['test_roc_auc']), 3)
    avg_prec = round(np.mean(res_validate['test_average_precision']), 3)
    f1 = round(np.mean(res_validate['test_f1']), 3)

    # Log all metrics to wandb in a single step
    wandb.log({"accuracy": accuracy,
               "precision": precision,
               "recall": recall,
               "roc_auc": roc_auc,
               "pr_auc": avg_prec,
               "f1": f1})

if __name__ == "__main__":
    main()