Skip to content
Snippets Groups Projects
Commit 17eb5153 authored by thecml
Browse files

added learning curve plots

parent e70443ac
No related branches found
No related tags found
No related merge requests found
Pipeline #79822 passed
source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
source diff could not be displayed: it is too large. Options to address this: view the blob.
......@@ -49,7 +49,9 @@ def main(ats_resolution: int = None):
object_cols = ['Gender']
df_enc = preprocessor.one_hot_encode(df, object_cols)
df = pd.concat([df.drop(object_cols, axis=1), df_enc], axis=1)
df['Gender_Female'] = df['Gender_Female'].astype(int)
df['Gender_Male'] = df['Gender_Male'].astype(int)
# Concat dataframe in proper order
if target_name in ["Complete", "Compliance", "Fall"]:
ats_cols = df.filter(regex='Ats', axis=1)
......
......@@ -29,7 +29,6 @@ class BaseClassifer(ABC):
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=k)
model = self.make_model()
res_validate = cross_validate(model, self.X, self.y, cv=skf, scoring=metrics)
#res_probas = cross_val_predict(model, self.X, self.y, cv=skf, method="predict_proba")
results = dict()
for metric in metrics:
results[metric] = res_validate[f'test_{metric}']
......
......@@ -13,30 +13,32 @@ from pathlib import Path
import paths as pt
import yaml
PLOT_SCORES = False
def main():
with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
dl = data_loader.FallDataLoader("fall_emb.csv", settings).load_data()
X, y = dl.get_data()
parameters = {
'n_estimators': [400, 800],
'n_estimators': [800],
'class_weight': ['balanced'],
'max_features': ['auto'],
'max_depth' : [6],
'max_depth' : [int(x) for x in np.linspace(5, 50, num=5)],
'min_samples_split' : [10],
'min_samples_leaf': [3],
'criterion' : ['gini']
'criterion' : ['gini']
}
model = RandomForestClassifier(random_state=0,
class_weight="balanced")
rounds = 5
rounds = 1
outer_scores = np.zeros(rounds)
nested_scores = np.zeros(rounds)
metric = 'average_precision'
metric = 'neg_log_loss'
best_params = list()
for i in range(rounds):
......@@ -55,8 +57,8 @@ def main():
nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv,
scoring=metric)
nested_scores[i] = nested_score.mean()
print(f'Completed round {i+1}, PR AUC: %.3f (%.3f)'
% (nested_score.mean(), nested_score.std()))
print(f'Completed round {i+1}, logloss: %.3f (%.3f)'
% (-nested_score.mean(), nested_score.std()))
score_difference = outer_scores - nested_scores
print("Avg. difference of {:6f} with std. dev. of {:6f}."
......@@ -64,31 +66,32 @@ def main():
# Print the best params per round and score
for i, best_param in enumerate(best_params):
print(f"Round {i+1}: {best_param} with score {nested_scores[i]}")
# Plot scores on each round for nested and non-nested cross-validation
plt.style.use('seaborn')
plt.tight_layout()
plt.figure(figsize=(10,5))
outer_scores_line, = plt.plot(outer_scores, color='orange')
nested_line, = plt.plot(nested_scores, color='steelblue')
plt.ylabel("Score", fontsize="14")
plt.legend([outer_scores_line, nested_line],
["Non-Nested CV", "Nested CV"],
bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested vs Nested Cross-Validation",
x=.5, y=1.1, fontsize="15")
print(f"Round {i+1}: {best_param} with score {-nested_scores[i]}")
# Plot bar chart of the difference.
plt.figure(figsize=(10,5))
plt.tight_layout()
difference_plot = plt.bar(range(rounds), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
["Non-Nested CV - Nested CV Score"],
bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")
plt.show()
if PLOT_SCORES:
# Plot scores on each round for nested and non-nested cross-validation
plt.style.use('seaborn')
plt.tight_layout()
plt.figure(figsize=(10,5))
outer_scores_line, = plt.plot(outer_scores, color='orange')
nested_line, = plt.plot(nested_scores, color='steelblue')
plt.ylabel("Score", fontsize="14")
plt.legend([outer_scores_line, nested_line],
["Non-Nested CV", "Nested CV"],
bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested vs Nested Cross-Validation",
x=.5, y=1.1, fontsize="15")
# Plot bar chart of the difference.
plt.figure(figsize=(10,5))
plt.tight_layout()
difference_plot = plt.bar(range(rounds), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
["Non-Nested CV - Nested CV Score"],
bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")
plt.show()
if __name__ == "__main__":
main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment