Added learning curve plots

17eb5153 · thecml · e70443ac · 17eb5153 · 17eb5153 · 17eb5153
Commit 17eb5153 authored 3 years ago by thecml
--- a/ml/notebooks/MLP_shap.ipynb
+++ b/ml/notebooks/MLP_shap.ipynb
--- a/ml/notebooks/RF_learning_curves.ipynb
+++ b/ml/notebooks/RF_learning_curves.ipynb
--- a/ml/notebooks/RF_shap.ipynb
+++ b/ml/notebooks/RF_shap.ipynb
--- a/ml/notebooks/Timeseries_EDA.ipynb
+++ b/ml/notebooks/Timeseries_EDA.ipynb
--- a/ml/notebooks/XGB_shap.ipynb
+++ b/ml/notebooks/XGB_shap.ipynb
--- a/ml/src/data/make_dataset_full.py
+++ b/ml/src/data/make_dataset_full.py
@@ -49,7 +49,9 @@ def main(ats_resolution: int = None):
        object_cols = ['Gender']
        df_enc = preprocessor.one_hot_encode(df, object_cols)
        df = pd.concat([df.drop(object_cols, axis=1), df_enc], axis=1)
-    
+        df['Gender_Female'] = df['Gender_Female'].astype(int)
+        df['Gender_Male'] = df['Gender_Male'].astype(int)
+        
        # Concat dataframe in proper order
        if target_name in ["Complete", "Compliance", "Fall"]:
            ats_cols = df.filter(regex='Ats', axis=1)

--- a/ml/src/tools/classifiers.py
+++ b/ml/src/tools/classifiers.py
@@ -29,7 +29,6 @@ class BaseClassifer(ABC):
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=k)
        model = self.make_model()
        res_validate = cross_validate(model, self.X, self.y, cv=skf, scoring=metrics)
-        #res_probas = cross_val_predict(model, self.X, self.y, cv=skf, method="predict_proba")
        results = dict()
        for metric in metrics:
            results[metric] = res_validate[f'test_{metric}']

--- a/ml/src/tuning/tune_random_forest_gs.py
+++ b/ml/src/tuning/tune_random_forest_gs.py
@@ -13,30 +13,32 @@ from pathlib import Path
 import paths as pt
 import yaml

+PLOT_SCORES = False
+
 def main():
-    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
+    with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), 'r') as stream:
        settings = yaml.safe_load(stream)
    
    dl = data_loader.FallDataLoader("fall_emb.csv", settings).load_data()
    X, y = dl.get_data()
            
    parameters = { 
-        'n_estimators': [400, 800],
+        'n_estimators': [800],
        'class_weight': ['balanced'],
        'max_features': ['auto'],
-        'max_depth' : [6],
+        'max_depth' : [int(x) for x in np.linspace(5, 50, num=5)],
        'min_samples_split' : [10],
        'min_samples_leaf': [3],
-            'criterion' : ['gini']
+        'criterion' : ['gini']
    }
    
    model = RandomForestClassifier(random_state=0,
                                   class_weight="balanced")
    
-    rounds = 5
+    rounds = 1
    outer_scores = np.zeros(rounds)
    nested_scores = np.zeros(rounds)
-    metric = 'average_precision'
+    metric = 'neg_log_loss'
    best_params = list()
    
    for i in range(rounds):
@@ -55,8 +57,8 @@ def main():
        nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv,
                                       scoring=metric)
        nested_scores[i] = nested_score.mean()
-        print(f'Completed round {i+1}, PR AUC: %.3f (%.3f)'
-              % (nested_score.mean(), nested_score.std()))
+        print(f'Completed round {i+1}, logloss: %.3f (%.3f)'
+              % (-nested_score.mean(), nested_score.std()))
    
    score_difference = outer_scores - nested_scores
    print("Avg. difference of {:6f} with std. dev. of {:6f}."
@@ -64,31 +66,32 @@ def main():
    
    # Print the best params per round and score
    for i, best_param in enumerate(best_params):
-        print(f"Round {i+1}: {best_param} with score {nested_scores[i]}")
-    
-    # Plot scores on each round for nested and non-nested cross-validation
-    plt.style.use('seaborn')
-    plt.tight_layout()
-    plt.figure(figsize=(10,5))
-    outer_scores_line, = plt.plot(outer_scores, color='orange')
-    nested_line, = plt.plot(nested_scores, color='steelblue')
-    plt.ylabel("Score", fontsize="14")
-    plt.legend([outer_scores_line, nested_line],
-            ["Non-Nested CV", "Nested CV"],
-            bbox_to_anchor=(0, .4, .5, 0))
-    plt.title("Non-Nested vs Nested Cross-Validation",
-            x=.5, y=1.1, fontsize="15")
+        print(f"Round {i+1}: {best_param} with score {-nested_scores[i]}")
    
-    # Plot bar chart of the difference.
-    plt.figure(figsize=(10,5))
-    plt.tight_layout()
-    difference_plot = plt.bar(range(rounds), score_difference)
-    plt.xlabel("Individual Trial #")
-    plt.legend([difference_plot],
-            ["Non-Nested CV - Nested CV Score"],
-            bbox_to_anchor=(0, 1, .8, 0))
-    plt.ylabel("score difference", fontsize="14")
-    plt.show()
+    if PLOT_SCORES:
+        # Plot scores on each round for nested and non-nested cross-validation
+        plt.style.use('seaborn')
+        plt.tight_layout()
+        plt.figure(figsize=(10,5))
+        outer_scores_line, = plt.plot(outer_scores, color='orange')
+        nested_line, = plt.plot(nested_scores, color='steelblue')
+        plt.ylabel("Score", fontsize="14")
+        plt.legend([outer_scores_line, nested_line],
+                ["Non-Nested CV", "Nested CV"],
+                bbox_to_anchor=(0, .4, .5, 0))
+        plt.title("Non-Nested vs Nested Cross-Validation",
+                x=.5, y=1.1, fontsize="15")
+        
+        # Plot bar chart of the difference.
+        plt.figure(figsize=(10,5))
+        plt.tight_layout()
+        difference_plot = plt.bar(range(rounds), score_difference)
+        plt.xlabel("Individual Trial #")
+        plt.legend([difference_plot],
+                ["Non-Nested CV - Nested CV Score"],
+                bbox_to_anchor=(0, 1, .8, 0))
+        plt.ylabel("score difference", fontsize="14")
+        plt.show()
            
 if __name__ == "__main__":
    main()
\ No newline at end of file