Commit 7adfc467 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

Fixed some metrics for completing

parent 509cbfb1
Pipeline #40661 passed with stage
in 2 minutes and 42 seconds
%% Cell type:code id: tags:
 
``` python
# Consolidated imports: the original repeated accuracy_score /
# classification_report / confusion_matrix three times; duplicates are
# merged here and no imported name is removed.
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.decomposition import KernelPCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import category_encoders as ce
from tools import preprocessor

# Read the exercise (1Ex..9Ex) and assistive-technology (1Ats..10Ats)
# columns as strings so they are treated as categorical codes.
ex = {f'{i}Ex': str for i in range(1, 10)}
ats = {f'{i}Ats': str for i in range(1, 11)}
converters = {**ex, **ats}
# Plain string literal: the original used an f-string with no placeholders.
df = pd.read_csv('../data/processed/screening_0_complete.csv', converters=converters)

# One StandardScaler + classifier pipeline per candidate model; a fixed
# random_state keeps the comparison reproducible.
rf_pipeline = Pipeline([('scaler', StandardScaler()),
                        ('clf', RandomForestClassifier(random_state=0))])

et_pipeline = Pipeline([('scaler', StandardScaler()),
                        ('clf', ExtraTreesClassifier(random_state=0))])

gb_pipeline = Pipeline([('scaler', StandardScaler()),
                        ('clf', GradientBoostingClassifier(random_state=0))])

lr_pipeline = Pipeline([('scaler', StandardScaler()),
                        ('clf', LogisticRegression(random_state=0))])

knn_pipeline = Pipeline([('scaler', StandardScaler()),
                         ('clf', KNeighborsClassifier())])

svc_pipeline = Pipeline([('scaler', StandardScaler()),
                         ('clf', SVC(random_state=0, kernel="rbf", gamma='scale', probability=True))])

gnb_pipeline = Pipeline([('scaler', StandardScaler()),
                         ('clf', GaussianNB())])

# Target is the binary 'Complete' column (0/1); everything else is a feature.
X = df.drop(['Complete'], axis=1)
y = df['Complete']

# Target-encode the categorical Ex/Ats columns with the CatBoost encoder.
cols = [f'{i}Ex' for i in range(1, 10)] + [f'{i}Ats' for i in range(1, 11)]
X = preprocessor.encode_vector_catboost(X, y, cols)

# Keep demographics plus the encoded categorical columns.
X = X[['Gender', 'BirthYear', 'NumberAts'] + cols]

# Stratified 70/30 split preserves the class balance in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

# Baseline fit + holdout evaluation for the random forest.
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

print(str(y.value_counts()) + "\n")
print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}\n")
print(f"Classification Report:\n {classification_report(y_test, y_pred)}\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
```
 
%% Output
 
0 521\n1 364\nName: Complete, dtype: int64\n\nConfusion Matrix:\n [[127 30]\n [ 87 22]]\n\nClassification Report:\n precision recall f1-score support\n\n 0 0.59 0.81 0.68 157\n 1 0.42 0.20 0.27 109\n\n accuracy 0.56 266\n macro avg 0.51 0.51 0.48 266\nweighted avg 0.52 0.56 0.52 266
0 491\n1 394\nName: Complete, dtype: int64\n\nConfusion Matrix:\n [[108 40]\n [ 82 36]]\n\nClassification Report:\n precision recall f1-score support\n\n 0 0.57 0.73 0.64 148\n 1 0.47 0.31 0.37 118\n\n accuracy 0.54 266\n macro avg 0.52 0.52 0.51 266\nweighted avg 0.53 0.54 0.52 266\n\n\nAccuracy: 0.5413533834586466
 
%% Cell type:code id: tags:
 
``` python
pipelines = [rf_pipeline, et_pipeline, gb_pipeline, lr_pipeline,
             knn_pipeline, svc_pipeline, gnb_pipeline]

# Compare every candidate pipeline with 5-fold cross-validation on
# four metrics (accuracy, balanced accuracy, F1, ROC AUC).
for pipeline in pipelines:
    acc = cross_val_score(pipeline, X, y, scoring='accuracy', cv=5)
    ba = cross_val_score(pipeline, X, y, scoring='balanced_accuracy', cv=5)
    f1 = cross_val_score(pipeline, X, y, scoring='f1', cv=5)
    roc = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=5)

    print(f"Results for: {str(pipeline['clf'])}")
    print("ACC CV average: %0.2f (+/- %0.2f)" % (acc.mean(), acc.std() * 2))
    print("BA CV average: %0.2f (+/- %0.2f)" % (ba.mean(), ba.std() * 2))
    print("F1 CV average: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std() * 2))
    print("ROC_AUC CV average: %0.2f (+/- %0.2f)\n" % (roc.mean(), roc.std() * 2))
```
 
%% Output
 
Results for: RandomForestClassifier(random_state=0)
ACC CV average: 0.52 (+/- 0.06)
BA CV average: 0.48 (+/- 0.04)
F1 CV average: 0.27 (+/- 0.09)
ROC_AUC CV average: 0.46 (+/- 0.04)
Results for: ExtraTreesClassifier(random_state=0)
ACC CV average: 0.52 (+/- 0.09)
BA CV average: 0.47 (+/- 0.07)
F1 CV average: 0.23 (+/- 0.08)
ROC_AUC CV average: 0.48 (+/- 0.11)
Results for: GradientBoostingClassifier(random_state=0)
ACC CV average: 0.52 (+/- 0.06)
BA CV average: 0.49 (+/- 0.03)
F1 CV average: 0.35 (+/- 0.11)
ROC_AUC CV average: 0.47 (+/- 0.03)
Results for: LogisticRegression(random_state=0)
ACC CV average: 0.55 (+/- 0.05)
BA CV average: 0.48 (+/- 0.04)
F1 CV average: 0.13 (+/- 0.18)
ROC_AUC CV average: 0.43 (+/- 0.05)
Results for: KNeighborsClassifier()
ACC CV average: 0.54 (+/- 0.08)
BA CV average: 0.50 (+/- 0.09)
F1 CV average: 0.36 (+/- 0.16)
ROC_AUC CV average: 0.50 (+/- 0.10)
Results for: SVC(probability=True, random_state=0)
ACC CV average: 0.56 (+/- 0.05)
BA CV average: 0.48 (+/- 0.04)
F1 CV average: 0.03 (+/- 0.07)
ROC_AUC CV average: 0.45 (+/- 0.06)
Results for: GaussianNB()
ACC CV average: 0.49 (+/- 0.06)
BA CV average: 0.45 (+/- 0.03)
F1 CV average: 0.30 (+/- 0.07)
ROC_AUC CV average: 0.45 (+/- 0.02)
 
%% Cell type:code id: tags:
 
``` python
# Imports hoisted to the top of the cell (they were originally placed
# between the function definitions and their call sites).
from sklearn.metrics import roc_curve, auc, roc_auc_score
# NOTE(review): catboost's get_roc_curve is never called in this cell —
# confirm whether a later cell needs it before removing the import.
from catboost.utils import get_roc_curve

def plot_roc_curve(fpr, tpr, label=None):
    """Plot one ROC curve plus the dashed diagonal chance line."""
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')

def get_roc_scores(pipeline, X, y):
    """Return (fpr, tpr) computed from 5-fold out-of-fold class-1 probabilities."""
    y_probas = cross_val_predict(pipeline, X, y, cv=5, method="predict_proba")
    fpr, tpr, _ = roc_curve(y, y_probas[:, 1])
    return fpr, tpr

# Out-of-fold ROC points for every candidate model.
fpr_rf, tpr_rf = get_roc_scores(rf_pipeline, X, y)
fpr_et, tpr_et = get_roc_scores(et_pipeline, X, y)
fpr_gb, tpr_gb = get_roc_scores(gb_pipeline, X, y)
fpr_lr, tpr_lr = get_roc_scores(lr_pipeline, X, y)
fpr_knn, tpr_knn = get_roc_scores(knn_pipeline, X, y)
fpr_svc, tpr_svc = get_roc_scores(svc_pipeline, X, y)
fpr_gnb, tpr_gnb = get_roc_scores(gnb_pipeline, X, y)

# Overlay all curves on a single figure.
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr_rf, tpr_rf, "Random Forest")
plot_roc_curve(fpr_et, tpr_et, "Extra Trees")
plot_roc_curve(fpr_gb, tpr_gb, "Gradient Boosting")
plot_roc_curve(fpr_lr, tpr_lr, 'Logistic Regression')
plot_roc_curve(fpr_knn, tpr_knn, 'KNeighbors')
plot_roc_curve(fpr_svc, tpr_svc, 'SVC-RBF')
plot_roc_curve(fpr_gnb, tpr_gnb, 'Gaussian Naive-Bayes')
plt.legend(loc="lower right")
plt.title("ROC plot")
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.show()
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Out-of-fold predicted probabilities on the held-out split.
# NOTE(review): cross_val_predict here folds over X_test only, so each fold
# model is trained on part of the test set — confirm this is intentional.
y_probas_rf = cross_val_predict(rf_pipeline, X_test, np.array(y_test).reshape(-1), cv=5, method="predict_proba")

# Sweep the decision threshold on the class-1 probability and report
# the resulting accuracy / precision / recall / confusion matrix.
for threshold in (0.3, 0.4, 0.5, 0.6, 0.7):
    y_scores_new = y_probas_rf[:, 1] > threshold
    print(f'Threshold: {threshold}, accuracy: {np.around(accuracy_score(y_test, y_scores_new), decimals=3)}, precision: {np.around(precision_score(y_test, y_scores_new), decimals=3)}, recall: {np.around(recall_score(y_test, y_scores_new), decimals=3)}, CM:\n{confusion_matrix(y_test, y_scores_new)}\n')
```
 
%% Output
 
Threshold: 0.3, accuracy: 0.444, precision: 0.417, recall: 0.899, CM:\n[[ 20 137]\n [ 11 98]]\n\nThreshold: 0.4, accuracy: 0.432, precision: 0.356, recall: 0.477, CM:\n[[63 94]\n [57 52]]\n\nThreshold: 0.5, accuracy: 0.534, precision: 0.368, recall: 0.193, CM:\n[[121 36]\n [ 88 21]]\n\nThreshold: 0.6, accuracy: 0.579, precision: 0.385, recall: 0.046, CM:\n[[149 8]\n [104 5]]\n\nThreshold: 0.7, accuracy: 0.583, precision: 0.0, recall: 0.0, CM:\n[[155 2]\n [109 0]]
 
%% Cell type:code id: tags:
 
``` python
# Rank features by the fitted random forest's impurity-based importances,
# most important first.
importances = rf_pipeline.named_steps['clf'].feature_importances_
indices = np.argsort(importances)[::-1]

for rank, idx in enumerate(indices, start=1):
    print("%d. Feature %s (%f)" % (rank, X_train.columns[idx], importances[idx]))
```
 
%% Output
 
1. Feature 2Ex (0.057120)\n2. Feature 3Ex (0.054892)\n3. Feature 6Ex (0.054205)\n4. Feature 9Ex (0.053958)\n5. Feature 8Ats (0.050759)\n6. Feature 2Ats (0.049322)\n7. Feature 5Ats (0.048698)\n8. Feature 5Ex (0.047976)\n9. Feature 1Ex (0.047436)\n10. Feature 4Ats (0.047094)\n11. Feature 4Ex (0.046714)\n12. Feature 7Ex (0.046301)\n13. Feature 9Ats (0.045912)\n14. Feature 3Ats (0.045567)\n15. Feature 10Ats (0.045237)\n16. Feature 7Ats (0.045024)\n17. Feature 8Ex (0.044557)\n18. Feature 6Ats (0.044272)\n19. Feature 1Ats (0.042002)\n20. Feature BirthYear (0.041143)\n21. Feature NumberAts (0.034604)\n22. Feature Gender (0.007207)
 
%% Cell type:code id: tags:
 
``` python
# Horizontal bar chart of the RF importances; both slices are reversed so
# the most important feature ends up as the top bar.
plt.figure(figsize=(6, 6))
plt.title('Feature importance for Random Forest classifier')
plt.barh(X_train.columns[indices][::-1], importances[indices][::-1])
```
 
%% Output
 
<BarContainer object of 22 artists>
 
 
%% Cell type:code id: tags:
 
``` python
import warnings
from sklearn.metrics import log_loss

warnings.simplefilter(action='ignore', category=FutureWarning)

# Learning curves: refit on the first m training rows and record the
# train/test log loss at each size.
train_errors, test_errors = [], []
for m in range(1, 50):
    rf_pipeline.fit(X_train[:m], y_train[:m])
    y_train_predict = rf_pipeline.predict_proba(X_train[:m])
    y_test_predict = rf_pipeline.predict_proba(X_test)
    # `labels` keeps log_loss well-defined when the first m rows contain a
    # single class. y is the integer-coded 'Complete' column (0/1), so the
    # labels must be [0, 1]; the original passed ['fails', 'completes'],
    # which does not match the values actually present in y_train.
    log_loss_train = log_loss(y_train[:m], y_train_predict, eps=1e-15,
                              labels=[0, 1])
    log_loss_test = log_loss(y_test, y_test_predict, eps=1e-15)
    train_errors.append(np.around(log_loss_train, decimals=3))
    test_errors.append(np.around(log_loss_test, decimals=3))

# Average over the five largest training sizes.
# (Typo fixed: the original printed 'Mean log los trest'.)
print(f'Mean log loss train: {np.mean(train_errors[-5:])}')
print(f'Mean log loss test: {np.mean(test_errors[-5:])}')
```
 
%% Output
 
Mean log loss train: 0.8602000000000001\nMean log los trest: 0.7171999999999998
 
%% Cell type:code id: tags:
 
``` python
# Plot the train/test log-loss series collected in the previous cell.
plt.figure(figsize=(10, 6))
plt.plot(train_errors, "r-+", linewidth=2, label="train")
plt.plot(test_errors, "b-", linewidth=2, label="test")
plt.title("Learning curves for RF classifier")
plt.ylabel("Log loss")
plt.xlabel('Training set size')
plt.legend(loc="upper right")
plt.yscale('log')  # log scale keeps the early, large losses readable
plt.grid(True)
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
from lime.lime_tabular import LimeTabularExplainer
import shap

shap.initjs()

# Build a LIME explainer over the training data for the RF pipeline.
classes = rf_pipeline.classes_
explainer = LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=X_train.columns,
    class_names=classes,
    mode='classification',
    discretize_continuous=True,
)

# Explain the first test instance and render the explanation as a figure.
exp = explainer.explain_instance(X_test.iloc[0], rf_pipeline.predict_proba)
exp.as_pyplot_figure()
```
 
%% Output
 
 
<Figure size 432x288 with 1 Axes>
 
 
%% Cell type:code id: tags:
 
``` python
# LIME explanation for the second test instance, rendered as a matplotlib figure.
exp = explainer.explain_instance(X_test.iloc[1],
                                 rf_pipeline.predict_proba)
exp.as_pyplot_figure()
```
 
%% Output
 
<Figure size 432x288 with 1 Axes>
 
 
%% Cell type:code id: tags:
 
``` python
# LIME explanation for the third test instance, rendered as a matplotlib figure.
exp = explainer.explain_instance(X_test.iloc[2],
                                 rf_pipeline.predict_proba)
exp.as_pyplot_figure()
```
 
%% Output
 
<Figure size 432x288 with 1 Axes>
 
 
%% Cell type:code id: tags:
 
``` python
# Re-explain the first test instance and render it inline in the notebook;
# show_all=False limits the table to the features used in the explanation.
exp = explainer.explain_instance(X_test.iloc[0],
                                 rf_pipeline.predict_proba)
exp.show_in_notebook(show_table=True, show_all=False)
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Fit a standalone class-weighted random forest (outside the scaled pipeline)
# and visualize its SHAP values on the test set.
clf = RandomForestClassifier(n_estimators=400, class_weight='balanced', random_state=0)
clf.fit(X_train, y_train)

explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)
# shap_values holds one array per class; index 1 selects the positive class.
shap.summary_plot(shap_values[1], X_test)
```
 
%% Output