Commit 21cb5577 authored by thecml's avatar thecml
Browse files

added xgboost code as notebook

parent 42ddc2fb
Pipeline #66423 passed with stage
in 3 minutes and 40 seconds
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
......@@ -60,7 +60,7 @@ def main():
param_grid = [
{
'cluster_maker__init': ['random', 'Huang', 'Cao'],
'cluster_maker__n_clusters': [2, 5, 10, 20, 30, 40, 50],
'cluster_maker__n_clusters': [5, 10, 15, 20, 30],
'cluster_maker__n_init': [1, 5, 10, 15, 20]
}
]
......
......@@ -17,17 +17,17 @@ def main():
with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
dl = data_loader.FallDataLoader("fall_emb.csv", settings).load_data()
X, y = dl.get_data()
param_grid = {
parameters = {
'n_estimators': [400, 800],
'class_weight': ['balanced'],
'max_features': ['auto'],
'max_depth' : [6],
'min_samples_split' : [10],
'min_samples_leaf': [3],
'criterion' : ['gini']
'criterion' : ['gini']
}
model = RandomForestClassifier(random_state=0,
......@@ -45,7 +45,7 @@ def main():
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
# Non-nested parameter search and scoring
clf = GridSearchCV(estimator=model, param_grid=param_grid,
clf = GridSearchCV(estimator=model, param_grid=parameters,
cv=inner_cv, scoring=metric)
clf.fit(X, y)
outer_scores[i] = clf.best_score_
......@@ -55,14 +55,16 @@ def main():
nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv,
scoring=metric)
nested_scores[i] = nested_score.mean()
print(f'Completed round {i+1}, PR AUC: %.3f (%.3f)'
% (nested_score.mean(), nested_score.std()))
score_difference = outer_scores - nested_scores
print("Avg. difference of {:6f} with std. dev. of {:6f}."
.format(score_difference.mean(), score_difference.std()))
# Print the best params per round
# Print the best params per round and score
for i, best_param in enumerate(best_params):
print(f"Round {i+1}: {best_param}")
print(f"Round {i+1}: {best_param} with score {nested_scores[i]}")
# Plot scores on each round for nested and non-nested cross-validation
plt.style.use('seaborn')
......
import numpy as np
import pandas as pd
import paths as pt
from sklearn.model_selection import GridSearchCV
from tools import data_loader, file_writer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import cross_validate, cross_val_score
import matplotlib.pyplot as plt
from pathlib import Path
import paths as pt
import yaml
import xgboost as xgb
def main():
    """Tune and evaluate an XGBoost classifier with repeated nested CV.

    Loads the embedded "complete" dataset, grid-searches XGBoost
    hyper-parameters with an inner StratifiedKFold (non-nested score),
    then scores the whole search object with an outer StratifiedKFold
    (nested score). Prints per-round PR AUC, the average optimism
    (non-nested minus nested score), and the best parameters per round,
    and plots both score curves plus their per-round difference.
    """
    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
        settings = yaml.safe_load(stream)
    dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
    X, y = dl.get_data()
    model = xgb.XGBClassifier()
    # Class-imbalance correction: weight positives by negatives/positives.
    # NOTE(review): assumes y contains both classes; pos == 0 would divide by zero.
    neg, pos = np.bincount(y)
    scale_pos_weight = neg / pos
    parameters = {'n_estimators': [100, 200],
                  'objective': ['binary:logistic'],
                  'eval_metric': ["aucpr"],
                  'use_label_encoder': [False],
                  'scale_pos_weight': [1, scale_pos_weight],
                  'learning_rate': [0.05],
                  'max_depth': [3, 4, 6, 8],
                  'max_delta_step': [0, 1, 5, 10],
                  #'gamma': [0.5, 1, 1.5, 2, 5],
                  #'min_child_weight': [1, 5, 10],
                  #'subsample': [0.6, 0.8, 1.0],
                  #'colsample_bytree': [0.6, 0.8, 1.0],
                  'seed': [0]}
    rounds = 5
    outer_scores = np.zeros(rounds)
    nested_scores = np.zeros(rounds)
    metric = 'average_precision'  # PR AUC
    best_params = list()
    for i in range(rounds):
        # Define both cross-validation objects (inner & outer), seeded per round
        # so each round uses a different but reproducible split.
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
        # Non-nested parameter search and scoring
        clf = GridSearchCV(estimator=model, param_grid=parameters,
                           cv=inner_cv, scoring=metric)
        clf.fit(X, y)
        outer_scores[i] = clf.best_score_
        best_params.append(clf.best_params_)
        # Nested CV with parameter optimization
        nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv,
                                       scoring=metric)
        nested_scores[i] = nested_score.mean()
        # Fix: original mixed an f-string with %-formatting; use one f-string.
        print(f'Completed round {i+1}, PR AUC: '
              f'{nested_score.mean():.3f} ({nested_score.std():.3f})')
    # Positive difference = non-nested CV is optimistic relative to nested CV.
    score_difference = outer_scores - nested_scores
    # Fix: '{:6f}' (field width 6) was a typo for '{:.6f}' (6 decimals).
    print(f"Avg. difference of {score_difference.mean():.6f} "
          f"with std. dev. of {score_difference.std():.6f}.")
    # Print the best params per round and score
    for i, best_param in enumerate(best_params):
        print(f"Round {i+1}: {best_param} with score {nested_scores[i]}")
    # Plot scores on each round for nested and non-nested cross-validation
    plt.style.use('seaborn')  # NOTE(review): deprecated in matplotlib >= 3.6; use 'seaborn-v0_8' there
    plt.tight_layout()
    plt.figure(figsize=(10, 5))
    outer_scores_line, = plt.plot(outer_scores, color='orange')
    nested_line, = plt.plot(nested_scores, color='steelblue')
    plt.ylabel("Score", fontsize="14")
    plt.legend([outer_scores_line, nested_line],
               ["Non-Nested CV", "Nested CV"],
               bbox_to_anchor=(0, .4, .5, 0))
    plt.title("Non-Nested vs Nested Cross-Validation",
              x=.5, y=1.1, fontsize="15")
    # Plot bar chart of the difference.
    plt.figure(figsize=(10, 5))
    plt.tight_layout()
    difference_plot = plt.bar(range(rounds), score_difference)
    plt.xlabel("Individual Trial #")
    plt.legend([difference_plot],
               ["Non-Nested CV - Nested CV Score"],
               bbox_to_anchor=(0, 1, .8, 0))
    plt.ylabel("score difference", fontsize="14")
    plt.show()


if __name__ == "__main__":
    main()
\ No newline at end of file
......@@ -24,8 +24,8 @@ def main(ats_resolution: int = None):
for target_name in ["Complete", "Compliance", "Fall"]:
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
f'{target_name.lower()}.csv',
converters=ats)
f'{target_name.lower()}.csv',
converters=ats)
# Make a df to be encoded
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
......
......@@ -73,7 +73,7 @@ def main(dataset_version : str = 'emb'):
y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]
optimize_rounds = True
early_stopping_rounds = 50
early_stopping_rounds = 200
if optimize_rounds:
eval_set=[(X_valid_split, y_valid_split)]
fit_model = model.fit(X_train_split, y_train_split,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment