Commit bc4252c1 authored by Carsten Eie Frigaard's avatar Carsten Eie Frigaard
Browse files

updated_k_fold_demo

parent a816ed12
%% Cell type:markdown id: tags:
# ITMAL L04
## K-fold CV demo
Original code from p. 89 of [HOML].
(CEF: code cleaned up, global calls put into functions, changed `StratifiedKFold` to plain `KFold`)
%% Cell type:code id: tags:
``` python
# Load the MNIST data set.
# NOTE(review): this cell loads the data twice -- once through the libitmal
# helper (into X, y) and once through fetch_openml (into mnist). It looks like
# both sides of a diff were kept; confirm which loader is intended.
print("MNIST data get and unpack (slow)..")
#print("MNIST data get and unpack (slow)..")
#from sklearn.datasets import fetch_openml
#mnist = fetch_openml('mnist_784', version=1)
#print(f" MNIST keys={mnist.keys()}")
# from libitmal.dataloaders (faster)
from libitmal.dataloaders import MNIST_GetDataSet
# X, y from the helper are reassigned from mnist["data"] / mnist["target"]
# in the "Design Matrix setup" cell.
X, y = MNIST_GetDataSet()
from sklearn.datasets import fetch_openml
# Fetch the 'mnist_784' data set (version 1); the result is indexed by key below.
mnist = fetch_openml('mnist_784', version=1)
print(f" MNIST keys={mnist.keys()}")
print("OK")
```
%%%% Output: stream
MNIST data get and unpack (slow)..
MNIST keys=dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])
OK
%% Cell type:code id: tags:
``` python
import numpy as np
from sklearn.linear_model import SGDClassifier

# Build the design matrix X and label vector y from the fetched data set.
print("Design Matrix setup..")
# Coerce to plain ndarrays: depending on the scikit-learn version, fetch_openml
# may hand back pandas objects, and the cells below rely on ndarray semantics
# (positional row indexing such as X_test[n] and X[train_index], and an
# isinstance(..., np.ndarray) check in MyKFoldSplit).
X = np.asarray(mnist["data"], dtype=np.float32)
y = np.asarray(mnist["target"]).astype(np.uint8)
print(f" X: {X.shape}, y: {y.shape}")

# Fixed split: first 60000 rows for training, the remainder for testing.
print("Train/test split..")
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# Binary targets for the "is this digit a 5?" classifier.
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
print(f" Train: X: {X_train.shape}, y: {y_train.shape}")
print(f" Test : X: {X_test.shape}, y: {y_test_5.shape}")

# Train the binary SGD classifier on the full training split.
print("SGD model setup and train..")
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd_clf.fit(X_train, y_train_5)
print("\nOK")
```
%%%% Output: stream
Design Matrix setup..
X: (70000, 784), y: (70000,)
Train/test split..
Train: X: (60000, 784), y: (60000,)
Test : X: (10000, 784), y: (10000,)
SGD model setup and train..
OK
%% Cell type:code id: tags:
``` python
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
def plot_digit(data):
    """Draw a flattened image as a 28x28 picture using the binary colormap, axes hidden."""
    pixels = data.reshape(28, 28)
    render_options = {"cmap": mpl.cm.binary, "interpolation": "nearest"}
    plt.imshow(pixels, **render_options)
    plt.axis("off")
def TestPredict(n):
    """Plot test sample ``n`` and print its true is-5 label next to the prediction.

    Reads the notebook globals ``X_test``, ``y_test_5`` and ``sgd_clf``.
    """
    sample, expected = X_test[n], y_test_5[n]
    plot_digit(sample)
    # predict() is given a one-element batch, so the result is a length-1 array.
    predicted = sgd_clf.predict([sample])
    print(f" ground_truth={expected}")
    print(f" predicted ={predicted}")
# Spot-check the fitted classifier on two test samples (indices 42 and 45).
print("Do some predictions..")
TestPredict(42)
TestPredict(45)
print("OK")
```
%%%% Output: stream
Do some predictions..
ground_truth=False
predicted =[False]
ground_truth=True
predicted =[ True]
OK
%%%% Output: display_data
![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAOcAAADnCAYAAADl9EEgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAF4ElEQVR4nO3dsU+TXRjG4bfGydEAI66go5FJdDRxo4zGUcfuIpManJ0wIf4L4KabicyOBlcZhZUZp2/4kvY52lJ6Y69r9MmJryU/T8LJ6ds7Pz/vgDzXZv0AwHDihFDihFDihFDihFDXG3O/yoXp6w37QzsnhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhBInhLo+6wfg/54+fVrOj4+Py/nq6mo5v3///kTrp2lhYWHk7NatW5f4JBnsnBBKnBBKnBBKnBBKnBBKnBBKnBCqd35+Xs3LIeM5OTkZOVtbWyvX/vz5s5z3er1y3vh5l+snWfsn6x88eDByNhgMyrX9fr+chxv6wdk5IZQ4IZQ4IZQ4IZQ4IZQ4IZQrYzNQXftqHZW8f/++nD9//rycV8c4Xdd1BwcHI2crKyvl2h8/fpTzluq62s7OTrn29u3b5bz17InsnBBKnBBKnBBKnBBKnBBKnBBKnBDKOWeY1rWrSS0uLpbz1jlppbryNamtra1yfhXPMVvsnBBKnBBKnBBKnBBKnBBKnBBKnBDKOWeY1tdHzqv19fVZP8Kls3NCKHFCKHFCKHFCKHFCKHFCKHFCKOecM3B0dDRyNu37nFwddk4IJU4IJU4IJU4IJU4IJU4IJU4I5ZxzBvb390fO3OfkP3ZOCCVOCCVOCCVOCCVOCCVOCNVr/Ore7/Wn4Nq10f8ntq6MLS8vl/OFhYWxnulPvHjxopz3+/2p/d3/uKE/dDsnhBInhBInhBInhBInhBInhBInhHJlbArevHlTzie5FjbtK2Wnp6cjZ5ubm+Xaz58/l/NHjx6N9Uzzys4JocQJocQJocQJocQJocQJocQJodznHEP1Cr+u67q1tbVyfnZ2NnK2vb1drh0MBuV80vuc1Tnn0tJSubY1//LlSzlfXV0t5/8w9znhKhEnhBInhBInhBInhBInhBInhHKfcwzv3r0r59U5ZtfV9xpfvXo11jNdlOqcdGtrq1y7s7NTzg8PD8v5HJ9zDmXnhFDihFDihFDihFDihFDihFDihFDOOcfw7Nmzct56x+br168v8nEuzcbGRjl/+/btJT3JfLBzQihxQihxQihxQihxQihxQihHKWNYXl4u57u7u5f0JJfr69ev5XzaryecN3ZOCCVOCCVOCCVOCCVOCCVOCCVOCOWcc4j9/f1yvre3V84/ffp0kY8T4+PHj+W8dVWOv2PnhFDihFDihFDihFDihFDihFDihFC9xh28ubygd+/evXJevSav6672OefJycnI2dLSUrm2dc75/fv3cj7HrwAc+sHZOSGUOCGUOCGUOCGUOCGUOCGUOCHUXN7nrM7yuq7rTk9Py3nrFYDJWv/2x48fj5y1zjG3t7fL+RyfY47FzgmhxAmhxAmhxAmhxAmhxAmh5vIoZXFxsZzfvHmznP/69esiH+dCHR0dlfOXL1+W82/fvo2c3b17t1w7GAzKOX/HzgmhxAmhxAmhxAmhxAmhxAmhxAmh5vKcs+XOnTvl/MOHD+W89RWSGxsbI2cHBwfl2sPDw3Leek3f2dlZOd/c3Bw5293dLde2vjKUv2PnhFDihFDihFDihFDihFDihFDihFBeAThE607kw4cPy3nrqzWrz7z19ZONn1fX7/fL+ZMnTyZaz1R4BSBcJeKEUOKEUOKEUOKEUOKEUOKEUM45x3B8fFzO9/
b2ynl1J7O669l1Xbe+vl7OV1ZWyvmNGzfKOTPhnBOuEnFCKHFCKHFCKHFCKHFCKHFCKOecMHvOOeEqESeEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEEieEut6YD/3KPmD67JwQSpwQSpwQSpwQSpwQSpwQ6jeG1uak32Dk2AAAAABJRU5ErkJggg==)
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import KFold
from sklearn.base import clone
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def PrintScores(y_true, y_pred, i):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    ``i`` is the fold number shown as a "FOLD n:" prefix; a negative ``i``
    prints the scores without any prefix.
    """
    assert y_true.shape == y_pred.shape, f"mismatch in shapes, y_true.shape={y_true.shape}, y_pred.shape={y_pred.shape}"
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    acc, prec, rec, f1 = (score(y_true, y_pred) for score in scorers)
    if i >= 0:
        prefix = f"FOLD {i:2d}: "
    else:
        prefix = ""
    print(f" {prefix}accuracy={acc:.2f}, precision={prec:.2f}, recall={rec:.2f}, F1={f1:.2f}")
def MyKFoldSplit(clf, X, y, kfolds=3, debug=True):
    """Run a hand-rolled shuffled K-fold cross-validation and print per-fold scores.

    For each of the ``kfolds`` splits (KFold with shuffle=True, random_state=42)
    a fresh clone of ``clf`` is fitted on the training part and scored on the
    validation part via PrintScores. Nothing is returned.

    With ``debug`` set, a banner plus type/shape information for X, y and the
    index arrays of the first fold is printed.
    """
    def describe(label, arr, verbose):
        # The ndarray check always runs; only the printing is optional.
        assert isinstance(arr, np.ndarray)
        if verbose:
            kind_label = f"type({label})"
            kind = f"{type(arr)},"
            shape_label = f"{label}.shape"
            print(f" {kind_label:18s}={kind:24s} {shape_label:18s}={arr.shape}")

    if debug:
        print(f"MyKFoldSplit(clf, X, y, kfolds={kfolds})..")
    describe("X", X, debug)
    describe("y", y, debug)

    splitter = KFold(n_splits=kfolds, random_state=42, shuffle=True)
    for fold_no, (train_index, val_index) in enumerate(splitter.split(X, y)):
        # Index arrays are only reported for the first fold.
        first_fold = debug and fold_no == 0
        describe("train_index", train_index, first_fold)
        describe("val_index", val_index, first_fold)

        # Clone so every fold starts from an unfitted copy of the estimator.
        fold_clf = clone(clf)
        fold_clf.fit(X[train_index], y[train_index])

        y_val_fold = y[val_index]
        y_pred = fold_clf.predict(X[val_index])
        PrintScores(y_val_fold, y_pred, fold_no)
        # Manual accuracy, kept for reference:
        #   n_correct = sum(y_pred == y_val_fold)
        #   print(n_correct / len(y_pred))
        # My    : print 0.95035 0.96035 and 0.9604
        # Gereon: prints 0.9502, 0.96565 and 0.96495
# Run the 3-fold demo on the training split with the binary "is 5" labels.
print("K-fold demo..")
MyKFoldSplit(sgd_clf, X_train, y_train_5, kfolds=3)
print("OK")
```
%%%% Output: stream
K-fold demo..
MyKFoldSplit(clf, X, y, kfolds=3)..
type(X) =<class 'numpy.ndarray'>, X.shape =(60000, 784)
type(y) =<class 'numpy.ndarray'>, y.shape =(60000,)
type(train_index) =<class 'numpy.ndarray'>, train_index.shape =(40000,)
type(val_index) =<class 'numpy.ndarray'>, val_index.shape =(20000,)
FOLD 0: accuracy=0.97, precision=0.94, recall=0.70, F1=0.80
FOLD 1: accuracy=0.95, precision=0.67, recall=0.89, F1=0.76
FOLD 2: accuracy=0.97, precision=0.89, recall=0.73, F1=0.80
OK
%% Cell type:code id: tags:
``` python
# Final evaluation: refit on the whole training split, then score once on the test split.
print("Final test scores..")

print(" train yet a model with all train data..")
sgd_clf.fit(X_train, y_train_5)

print(" predict on test data..")
y_test_5_pred = sgd_clf.predict(X_test)

# i=-1 makes PrintScores omit the "FOLD n:" prefix.
PrintScores(y_true=y_test_5, y_pred=y_test_5_pred, i=-1)
print("OK")
```
%%%% Output: stream
Final test scores..
train yet a model with all train data..
predict on test data..
accuracy=0.95, precision=0.66, recall=0.88, F1=0.76
OK
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment