# test_ats_resolution.py
import pandas as pd
import numpy as np
import paths as pt
import os
import csv
import joblib
from pathlib import Path
from data import make_dataset_full, make_dataset_emb
from tools import file_reader, file_writer, explainer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

EPOCHS = 200

def _fit_and_evaluate(ats_res, target_name):
    """Rebuild the datasets for one ATS resolution, fit XGBoost, return its eval history.

    Args:
        ats_res: Value forwarded as ``ats_resolution`` to both dataset builders.
        target_name: Name of the label column in ``complete_emb.csv``.

    Returns:
        The dict from ``model.evals_result()``; ``'validation_0'`` holds the
        training-split metrics and ``'validation_1'`` the test-split metrics,
        matching the order of ``eval_set`` below.

    Raises:
        ValueError: If the target column does not contain exactly the two
            classes 0 and 1.
    """
    # Regenerate the processed files before reading them back.
    make_dataset_full.main(ats_resolution=ats_res)
    make_dataset_emb.main(ats_resolution=ats_res)

    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
    # Fixed-seed shuffle so every resolution is split the same way.
    df = df.sample(frac=1, random_state=0).reset_index(drop=True)

    X = df.drop([target_name], axis=1)
    y = df[target_name]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=0)

    # Class balance is taken from the full label column (not only the
    # training split), as in the original script.
    class_counts = np.bincount(y)
    if class_counts.size != 2:
        raise ValueError(
            f"Expected binary 0/1 labels in '{target_name}', "
            f"got class counts {class_counts.tolist()}")
    neg, pos = class_counts
    scale_pos_weight = neg / pos

    params = {
        "n_estimators": EPOCHS,
        "objective": "binary:logistic",
        "scale_pos_weight": scale_pos_weight,
        "use_label_encoder": False,
        "learning_rate": 0.07,
        "seed": 0,
    }

    model = xgb.XGBClassifier(**params)
    eval_set = [(X_train, y_train), (X_test, y_test)]
    eval_metric = ["auc", "logloss"]
    model.fit(X_train, y_train,
              eval_set=eval_set,
              eval_metric=eval_metric,
              verbose=False)

    return model.evals_result()


def _plot_logloss(logloss_train, logloss_test, file_name):
    """Plot train/test logloss curves and save them as a PDF.

    Args:
        logloss_train: List of ``(ats_res, per-iteration logloss)`` tuples
            for the training split.
        logloss_test: Same structure for the test split.
        file_name: Used both as the plot title and as the PDF file stem
            inside ``pt.REPORTS_PLOTS_DIR``.
    """
    fig, ax = plt.subplots()
    cmap = plt.cm.coolwarm
    train_color, test_color = cmap(0.), cmap(1.)

    curves = (("Train", train_color, logloss_train),
              ("Test", test_color, logloss_test))
    for split_name, color, history in curves:
        for ats_res, losses in history:
            # Derive the x-axis from the recorded history instead of EPOCHS
            # so the two can never get out of sync.
            x_axis = range(len(losses))
            ax.plot(x_axis, losses,
                    label=f'{split_name} ATS={ats_res}', color=color)
            # Tag the end of each curve with its resolution.
            ax.text(x_axis[-1], losses[-1], f'ATS={ats_res}', fontsize=2)

    # One legend entry per split rather than one per curve.
    custom_lines = [Line2D([0], [0], color=train_color, lw=4),
                    Line2D([0], [0], color=test_color, lw=4)]
    ax.legend(custom_lines, ['Train logloss',
                             'Test logloss'])

    ax.set_ylabel('Logloss')
    ax.set_xlabel('Iterations')
    ax.set_title(file_name)
    fig.savefig(Path(pt.REPORTS_PLOTS_DIR) / f'{file_name}.pdf',
                dpi=300,
                bbox_inches="tight")
    # Release the figure; otherwise every outer-loop iteration leaks one.
    plt.close(fig)


def main():
    """Train XGBoost across a range of ATS resolutions and plot logloss curves.

    For each group of ``step_size`` consecutive resolutions, the datasets are
    rebuilt and a classifier is fitted per resolution; the per-iteration
    train/test logloss of the whole group is then drawn into one PDF.
    """
    target_name = "Complete"
    step_size = 10

    for idx in range(1, 10 + 1, step_size):
        logloss_train, logloss_test = [], []
        for ats_res in range(idx, idx + step_size):
            results = _fit_and_evaluate(ats_res, target_name)
            logloss_train.append((ats_res, results['validation_0']['logloss']))
            logloss_test.append((ats_res, results['validation_1']['logloss']))

        file_name = f"XGBoost logloss for ATS {idx}-{idx+(step_size-1)}"
        _plot_logloss(logloss_train, logloss_test, file_name)

# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()