Commit 3a90cc65 authored by Carsten Eie Frigaard

update

parent bc4252c1
#%%
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
w1 = 2 # e.g. 0, 2,
w2 = 3 # e.g. 0, 3,
x = np.arange(-15,15,0.1)
y = np.arange(-15,15,0.1)
X,Y = np.meshgrid(x,y)
Z = 1/(1 + np.exp(-(w1*X + w2*Y)))
# Plot the sigmoid surface.
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
linewidth=0)
ax.set_xlabel('X axis')
ax.set_ylabel('Y axis')
ax.set_zlabel('Z axis')
# read men/women height/weight data..
# NOTE: units are pounds and inches, respectively
import matplotlib.pyplot as plt
import numpy as np
# Load data
data = np.loadtxt('height_weight.csv', delimiter=';', skiprows=1)
X = data[:,1:3]
y = data[:,0]
pound2kg = 0.453592
inch2cm = 2.54
Xw = pound2kg*X[:,1]
Xh = inch2cm*X[:,0]
print('weight :', np.mean(Xw), 'height:', np.mean(Xh))
#%% input data - scatter plot
idx_men = y == 0
plt.scatter(Xw[idx_men], Xh[idx_men], s=0.1, c='b')
idx_women = y == 1
plt.scatter(Xw[idx_women], Xh[idx_women], s=0.1, c='r')
#%% input-output relation
# gender as a function of weight/height
plt.plot(Xw, y, 'r.', markersize=1)
plt.plot(Xh, y, 'r.', markersize=1)
#%% distribution - e.g. weight
plt.hist([Xw[idx_men], Xw[idx_women]], bins=50)
%% Cell type:markdown id: tags:
# ITMAL Exercise
## The E.T. and Gradient Descent for Linear Regression
The Good News is that aliens are here, yes, really! The Bad News is that you are assigned as teacher for one Extra-Terrestrial _creature_.
Your task is to create a document (journal) for the _creature_, explaining all about Gradient Descent in relation to the linear regression model.
The _creature_ needs about four, max. six normal pages of text; otherwise it becomes very grumpy, like a Gremlin.
However, the _creature_ likes reading code and math and examining beautiful plots, so code sections, math and plots do not count towards the max normal-page limit.
As part of your job as an E.T. teacher, you must cover Gradient Descent for a simple Linear Regression model with at least the following concepts:
* Linear Regression model prediction in vectorized form
* MSE cost function for a Linear Regression model
* Closed-form solution (normal equation)
* Numerical gradient descent
* Batch Gradient Descent
* Stochastic Gradient Descent
* Learning rates
Feel free to add additional Gradient Descent concepts, but remember to keep the text you submit below the max. six pages (excluding plots, code and math).
(Once submitted as a hand-in in Brightspace, I will forward it to the E.T., but expect no direct feedback from the _creature_..)
%% Cell type:code id: tags:
``` python
# TODO: Your GD documentation for the E.T.
```
%% Cell type:markdown id: tags:
REVISIONS|
---------|------------------------------------------------
2021-0926|CEF, initial.
%% Cell type:markdown id: tags:
# ITMAL Exercise
REVISIONS|
---------|------------------------------------------------
2018-1218|CEF, initial.
2018-0214|CEF, major update.
2018-0218|CEF, fixed error in nabla expression.
2018-0218|CEF, added minimization plot.
2018-0218|CEF, added note on argmin/max.
2018-0218|CEF, changed concave to convex.
## Training a Linear Regressor I
The goal of linear regression is to find the argument $w$ that minimizes the sum-of-squares error over all inputs.
Given the usual ML input data matrix $\mathbf X$ of size $(n,d)$, where each row is a transposed input column vector $(\mathbf{x}^{(i)})^\top$, i.e. a data sample of size $d$
$$
\newcommand\rem[1]{}
\rem{ITMAL: CEF def and LaTeX commands, remember: no newlines in defs}
\newcommand\eq[2]{#1 &=& #2\\}
\newcommand\ar[2]{\begin{array}{#1}#2\end{array}}
\newcommand\ac[2]{\left[\ar{#1}{#2}\right]}
\newcommand\st[1]{_{\mbox{\scriptsize #1}}}
\newcommand\norm[1]{{\cal L}_{#1}}
\newcommand\obs[2]{#1_{\mbox{\scriptsize obs}}^{\left(#2\right)}}
\newcommand\diff[1]{\mbox{d}#1}
\newcommand\pown[1]{^{(#1)}}
\def\pownn{\pown{n}}
\def\powni{\pown{i}}
\def\powtest{\pown{\mbox{\scriptsize test}}}
\def\powtrain{\pown{\mbox{\scriptsize train}}}
\def\bX{\mathbf{M}}
\def\bX{\mathbf{X}}
\def\bZ{\mathbf{Z}}
\def\bw{\mathbf{m}}
\def\bx{\mathbf{x}}
\def\by{\mathbf{y}}
\def\bz{\mathbf{z}}
\def\bw{\mathbf{w}}
\def\btheta{{\boldsymbol\theta}}
\def\bSigma{{\boldsymbol\Sigma}}
\def\half{\frac{1}{2}}
\newcommand\pfrac[2]{\frac{\partial~#1}{\partial~#2}}
\newcommand\dfrac[2]{\frac{\mbox{d}~#1}{\mbox{d}#2}}
\bX =
\ac{cccc}{
x_1\pown{1} & x_2\pown{1} & \cdots & x_d\pown{1} \\
x_1\pown{2} & x_2\pown{2} & \cdots & x_d\pown{2}\\
\vdots & & & \vdots \\
x_1\pownn & x_2\pownn & \cdots & x_d\pownn\\
}
$$
and $\by$ is the target output column vector of size $n$
$$
\by =
\ac{c}{
y\pown{1} \\
y\pown{2} \\
\vdots \\
y\pown{n} \\
}
$$
The linear regression model, via its hypothesis function and for a column vector input $\bx\powni$ of size $d$ and a column weight vector $\bw$ of size $d+1$ (with the additional element $w_0$ being the bias), can now be written as simply as
$$
\ar{rl}{
h(\bx\powni;\bw) &= \bw^\top \ac{c}{1\\\bx\powni} \\
&= w_0 + w_1 x_1\powni + w_2 x_2\powni + \cdots + w_d x_d\powni
}
$$
using the model parameters or weights, $\bw$, aka $\btheta$. To ease notation, $\bx$ is assumed to have the 1-element prepended in the following, so that $\bx$ is a $d+1$ column vector
$$
\ar{rl}{
\ac{c}{1\\\bx\powni} &\mapsto \bx\powni, ~~~~\mbox{by convention in the following...}\\
h(\bx\powni;\bw) &= \bw^\top \bx\powni
}
$$
This is actually the first fully white-box machine learning algorithm that we see. All the gory details of the algorithm are clearly visible in the inner vector multiplication... quite simple, right? Now we just need to train the weights...
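As a quick, hedged illustration (the numbers below are made up just for demonstration), the vectorized prediction becomes a single matrix-vector product once the 1-element has been prepended:

``` python
import numpy as np

# hypothetical data: n=3 samples, d=2 features
X = np.array([[1.0, 2.0],
              [0.5, 1.5],
              [2.0, 0.0]])
w = np.array([0.5, 1.0, -2.0])            # [w_0 (bias), w_1, w_2]

X_b = np.c_[np.ones((X.shape[0], 1)), X]  # prepend the 1-element to each x^(i)
h = X_b @ w                               # h(x^(i); w) = w^T x^(i), for all i at once
print(h)                                  # one prediction per sample
```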
### Loss or Objective Function - Formulation for Linear Regression
The individual cost (or loss), $L\powni$, for a single input vector $\bx\powni$ is a measure of how well the model is able to fit the data: the higher the $L\powni$ value, the worse the fit. A loss of $L=0$ means a perfect fit.
It can be given by, say, the square difference from the calculated output, $h$, to the desired output, $y$
$$
\ar{rl}{
L\powni &= \left( h(\bx\powni;\bw) - y\powni \right)^2\\
&= \left( \bw^\top\bx\powni - y\powni \right)^2
}
$$
Minimizing all the $L\powni$ losses (and thereby indirectly also the MSE or RMSE) amounts to minimizing the sum of all the
individual costs, here written as the mean squared error
$$
\ar{rl}{
\mbox{MSE}(\bX,\by;\bw) &= \frac{1}{n} \sum_{i=1}^{n} L\powni \\
&= \frac{1}{n} \sum_{i=1}^{n} \left( \bw^\top\bx\powni - y\powni \right)^2\\
&= \frac{1}{n} ||\bX \bw - \by||_2^2
}
$$
here using the squared Euclidean norm, $\norm{2}^2$, via the $||\cdot||_2^2$ expressions.
Now the factor $\frac{1}{n}$ is just a constant and can be ignored, yielding the total cost function
$$
\ar{rl}{
J &= \frac{1}{2} ||\bX \bw - \by||_2^2\\
&\propto \mbox{MSE}
}
$$
with yet another constant, 1/2, added to ease the later differentiation of $J$.
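As a small, hedged sketch (reusing made-up data, not the exercise data), both the MSE and the scaled cost $J$ follow directly from the expressions above:

``` python
import numpy as np

# hypothetical data, with the bias column already prepended to X
X_b = np.array([[1.0, 1.0, 2.0],
                [1.0, 0.5, 1.5],
                [1.0, 2.0, 0.0]])
y = np.array([3.0, 2.0, 1.0])
w = np.array([0.5, 1.0, -2.0])

residual = X_b @ w - y              # (Xw - y)
mse = np.mean(residual**2)          # MSE = 1/n * ||Xw - y||_2^2
J = 0.5 * residual @ residual       # J = 1/2 * ||Xw - y||_2^2, proportional to the MSE
print(mse, J)
```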
### Training
Training the linear regression model now amounts to computing the optimal value of the weights $\bw$; that is, finding the $\bw$-value that minimizes the total cost
$$
\bw^* = \mbox{argmin}_\bw~J\\
$$
where $\mbox{argmin}_\bw$ means finding the argument $\bw$ that minimizes the $J$ function. This minimum (sometimes a maximum, via argmax) is denoted $\bw^*$ in most ML literature.
In 2-D, the minimization can be drawn visually as finding the lowest $J$, which for linear regression always forms a convex shape
<img src="Figs/minimization.png" style="width:150px">
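To see the convexity for yourself, a minimal sketch (assuming some made-up 1-D data and no bias term, purely for illustration) that plots $J$ as a function of a single weight could look like this:

``` python
import numpy as np
import matplotlib.pyplot as plt

# hypothetical 1-D data, roughly y = 2x + noise, no bias for simplicity
rng = np.random.default_rng(42)
x = np.linspace(-3, 3, 50)
y = 2 * x + 0.5 * rng.standard_normal(50)

ws = np.linspace(-2, 6, 200)
J = [0.5 * np.sum((w * x - y)**2) for w in ws]   # J(w) = 1/2 ||xw - y||_2^2

plt.plot(ws, J)
plt.xlabel('$w$')
plt.ylabel('$J(w)$')
plt.title('Convex cost of a 1-D linear regression (sketch)')
plt.show()
```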
### Training: The Closed-form Solution
To solve for $\bw^*$ in closed form (i.e. directly, without any numerical approximation), we find the gradient of $J$ with respect to $\bw$. Taking the partial derivative $\partial/\partial\bw$ of $J$ via the gradient (nabla) operator
$$
\rem{
\frac{\partial}{\partial \bw} =
\ac{c}{
\frac{\partial}{\partial w_1} \\
\frac{\partial}{\partial w_2} \\
\vdots\\
\frac{\partial}{\partial w_d}
}
}
\nabla_\bw~J =
\left[ \frac{\partial J}{\partial w_0}, \frac{\partial J}{\partial w_1}, \ldots , \frac{\partial J}{\partial w_d} \right]^\top
$$
and setting it to zero yields the optimal solution for $\bw$; ignoring all the constant factors of 1/2
and $1/n$, we get
$$
\ar{rl}{
\nabla_\bw J(\bw) &= \bX^\top \left( \bX \bw - \by \right) ~=~ 0\\
0 &= \bX^\top\bX \bw - \bX^\top\by
}
$$
giving the closed-form solution, with $\by = [y\pown{1}, y\pown{2}, \cdots,
y\pown{n}]^\top$
$$
\bw^* ~=~ \left( \bX^\top \bX \right)^{-1} \bX^\top \by
$$
You already know this method from calculus: finding the extrema of a function, say
$$f(w)=w^2-2w-2$$
is done by finding the places where the derivative $\mbox{d}~f(w)/\mbox{d}w = 0$
$$
\dfrac{f(w)}{w} = 2w -2 = 0
$$
so we see that there is an extremum at $w=1$. Checking the second derivative tells us whether we are looking at a minimum, a maximum or a saddle point at that point. In matrix terms, this corresponds to finding the _Hessian_ matrix, which gets notationally tricky due to the multiple feature dimensions involved.
> https://en.wikipedia.org/wiki/Ordinary_least_squares
> https://en.wikipedia.org/wiki/Hessian_matrix
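As a generic, hedged illustration of the normal equation on some made-up data (not the exercise data below; note also that in practice `np.linalg.solve` or `np.linalg.pinv` is preferred over an explicit inverse for numerical stability):

``` python
import numpy as np

# hypothetical data: y ~ 4 + 3x + noise
rng = np.random.default_rng(0)
X = 2 * rng.random((100, 1))
y = 4 + 3 * X[:, 0] + rng.standard_normal(100)

X_b = np.c_[np.ones((X.shape[0], 1)), X]                 # prepend the bias column
w_star = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y          # w* = (X^T X)^-1 X^T y
w_star_solve = np.linalg.solve(X_b.T @ X_b, X_b.T @ y)   # numerically safer alternative
print(w_star, w_star_solve)                              # both should be close to [4, 3]
```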
#### Qa Write a Python function that uses the closed-form to find $\bw^*$
Use the test data, `X1` and `y1`, in the code below to find `w1` via the closed-form. Use the test vector for `w1` to test your implementation, and remember to add the bias term (concatenate an all-ones column to `X1` before solving).
%% Cell type:code id: tags:
``` python
# TODO: Qa...
# TEST DATA:
import numpy as np
from libitmal import utils as itmalutils
itmalutils.ResetRandom()
X1 = np.array([[8.34044009e-01],[1.44064899e+00],[2.28749635e-04],[6.04665145e-01]])
y1 = np.array([5.97396028, 7.24897834, 4.86609388, 3.51245674])
w1_expected = np.array([4.046879011698, 1.880121487278])
assert False, "find the least-square solution for X1 and y1, your implementation here, say from [HOML] p.109"
# w1 = ...
# TEST VECTOR:
itmalutils.PrintMatrix(w1, label="w1=", precision=12)
itmalutils.AssertInRange(w1,w1_expected,eps=1E-9)
```
%% Cell type:markdown id: tags:
#### Qb Find the limits of the least-square method
Again, find the least-squares optimal value for `w2`, now using `X2` and `y2` as inputs.
Describe the problem with the matrix inverse: for what `M` and `N` combinations do you see that calculating the matrix inverse takes a long time?
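One hedged way to get a feeling for this (a sketch, not the required answer) is simply to time the inverse of the `N x N` matrix `X.T @ X` for growing `N`:

``` python
import time
import numpy as np

# hypothetical timing sketch: how does inverting X^T X scale with the number of features N?
for N in (100, 500, 1000, 2000):
    M = 2 * N                        # keep M > N so X^T X stays full rank
    X = np.random.rand(M, N)
    A = X.T @ X                      # an (N, N) matrix
    t0 = time.perf_counter()
    np.linalg.inv(A)
    dt = time.perf_counter() - t0
    print(f'N={N:5d}: inverting a ({N} x {N}) matrix took {dt:.3f} s')
```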
%% Cell type:code id: tags:
``` python
# TODO: Qb...
# TEST DATA: Matrix, taken from [HOML], p108
M=100
N=1
print(f'More test data, M={M}, N={N}...')
X2=2 * np.random.rand(M,N)
y2=4 + 3*X2 + np.random.randn(M,1)
y2=y2[:,0] # well, could do better here!
assert False, "find the least-square solution for X2 and y2, your implementation here, say from [HOML] p.109"
# w2 =
```
%% Cell type:markdown id: tags:
# ITMAL Exercise
REVISIONS|
---------|------------------------------------------------
2018-1218|CEF, initial.
2018-1302|CEF, major update.
2018-1402|CEF, spell checked.
2018-1802|CEF, added class hierarchy figure.
## Linear Regression II
The goal of the linear regression was to find the argument $\mathbf w$ that minimizes the objective function, $J$
$$
\newcommand\rem[1]{}
\rem{ITMAL: CEF def and LaTeX commands, remember: no newlines in defs}
\newcommand\eq[2]{#1 &=& #2\\}
\newcommand\ar[2]{\begin{array}{#1}#2\end{array}}
\newcommand\ac[2]{\left[\ar{#1}{#2}\right]}
\newcommand\st[1]{_{\mbox{\scriptsize #1}}}
\newcommand\norm[1]{{\cal L}_{#1}}
\newcommand\obs[2]{#1_{\mbox{\scriptsize obs}}^{\left(#2\right)}}
\newcommand\diff[1]{\mbox{d}#1}
\newcommand\pown[1]{^{(#1)}}
\def\pownn{\pown{n}}
\def\powni{\pown{i}}
\def\powtest{\pown{\mbox{\scriptsize test}}}
\def\powtrain{\pown{\mbox{\scriptsize train}}}
\def\bX{\mathbf{M}}
\def\bX{\mathbf{X}}
\def\bZ{\mathbf{Z}}
\def\bw{\mathbf{m}}
\def\bx{\mathbf{x}}
\def\by{\mathbf{y}}
\def\bz{\mathbf{z}}
\def\bw{\mathbf{w}}
\def\btheta{{\boldsymbol\theta}}
\def\bSigma{{\boldsymbol\Sigma}}
\def\half{\frac{1}{2}}
\newcommand\pfrac[2]{\frac{\partial~#1}{\partial~#2}}
\newcommand\dfrac[2]{\frac{\mbox{d}~#1}{\mbox{d}#2}}
\ar{rl}{
\bw^* &= \mbox{argmin}_\bw~J\\
&= \mbox{argmin}_\bw\frac{1}{2} ||\bX \bw - \by||_2^2
}
$$
for some input data, and using the $\norm{2}^2$ or MSE internally in the loss function.
To solve this equation in closed form (directly, without any numerical approximation), we found the optimal solution to be the rather elegant least-squares solution
$$
\bw^* ~=~ \left( \bX^\top \bX \right)^{-1} \bX^\top \by
$$
Now we want to build a Python linear regression class that encapsulates this elegant closed-form solution.
#### Qa Complete the estimator class `MyRegressor`
This one will be based on the linear regression closed-form solution from the previous notebook (linear_regression_1.ipynb).
Use your knowledge of how to create Python classes and the fit-predict Scikit-learn interface. The class hierarchy will look like
<img src="Figs/class_regression.png" style="width:210px">
Finalize or complete the `fit()`, `predict()` and `mse()` score functions in the `MyRegressor` class; there is already a good structure for it in the code below.
Also, notice the implementation of the `score()` function in the class, which is similar to `sklearn.linear_model.LinearRegression`'s score function, i.e. the $R^2$ score we saw in an earlier lesson.
Use the test stub below, which creates some simple test data, similar to [HOML] p.108/110, and runs a fit-predict on your brand-new linear regression closed-form estimator.
%% Cell type:code id: tags:
``` python
# TODO: Qa, with some help
import numpy as np
from numpy.linalg import inv
from sklearn.base import BaseEstimator
from sklearn.metrics import r2_score
from libitmal import utils as itmalutils
class MyRegressor(BaseEstimator):
def __init__(self):
self.__w = None
def weights(self):
return self.__w
def __addbiasterm(self, X: np.array):
assert X.ndim==2
assert X.shape[0]>0, 'empty X array'
assert X.shape[1]>0, 'empty X array'
X_b = np.c_[np.ones((X.shape[0],1)), X] # Add x0=1 to each instance
return X_b
def fit(self, X: np.array, y: np.array):
        # NOTE: really too tight restrictions on the input X and y types:
assert str(type(X))=="<class 'numpy.ndarray'>"
assert str(type(y))=="<class 'numpy.ndarray'>"
assert X.ndim==2, f'expected X array of ndim=2, got ndim={X.ndim}'
assert y.ndim==1, f'expected y array of ndim=1, got ndim={y.ndim}'
        assert X.shape[0]==y.shape[0], f'expected X.shape[0] to be equal to y.shape[0], got X.shape[0]={X.shape[0]}, y.shape[0]={y.shape[0]}'
# y=Xw, where X is a matrix with full column rank, the least squares solution,
#
# w^=argmin∥Xw−y∥2
#
# is given by
#
# w^=(X^TX)^−1 X^T y
# HOML, p.111
assert False, 'TODO: implement it: add the least-square calc of w'
#self.__w = ...
return self
def predict(self, X: np.array):
#if (not self.__w):
# raise RuntimeError('You must train model before predicting data!')
X_b = self.__addbiasterm(X)
assert X_b.ndim==2
assert X_b.shape[1]==self.__w.shape[0]
assert False, 'TODO: implement it: add the predictor code'
#p = ...
assert p.ndim==1
assert p.shape[0]==X.shape[0]
return p
def score(self, X: np.array, y: np.array):
# Returns the coefficient of determination R^2 of the prediction.
# The coefficient R^2 is defined as (1 - u/v), where u is the residual
# sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum
# of squares ((y_true - y_true.mean()) ** 2).sum().
p=self.predict(X)
assert p.ndim==1
assert y.ndim==1
assert p.shape==y.shape
return r2_score(y, p)
def mse(self, X: np.array, y: np.array):
p = self.predict(X)
assert p.shape==y.shape
diff=y-p
assert diff.ndim==1
n=y.shape[0]
assert n>0
# could use np.dot() but lets be verbose
J=0.0
assert False, 'TODO: implement it: add the J sum functionality via MSE'
#...
itmalutils.CheckFloat(J)
        if J<0:
            raise ValueError(f'expected J to be >= 0, got J={J}')
return J
def __str__ (self):
return f'MyRegressor: _w={self.__w}'
print('OK')
```
%%%% Output: stream
OK
%% Cell type:code id: tags:
``` python
# TEST SECTION for Qa...or use your own tests!
from sklearn.metrics import mean_squared_error
from libitmal import utils as itmalutils
def PrintResults(msg, f, X: np.array, y: np.array, p: np.array):
print(msg)
reg_score=reg.score(X,y)
mse=mean_squared_error(p,y)
try:
reg_mse=reg.mse(X,y)
except AttributeError:
reg_mse = mse # fall back
print(f' f={f}')
#print(f' y={y}')
#print(f' p={p}')
#print(f' y-p={y-p}')
print(f' reg_score={reg_score}')
print(f' reg_mse={reg_mse}')
print(f' mean_squared_error(y,p)={mse}')
itmalutils.AssertInRange(reg_mse,mse)
return reg_mse, reg_score
def DoPlot(X, y):
    %matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.plot(X, y, "b.")
plt.xlabel('$x_1$', fontsize=18)
plt.ylabel('$y$', rotation=0, fontsize=18)
plt.axis([0, 2, 0, 15])
plt.show()
itmalutils.ResetRandom()
M=200
N=20
print(f'M={M}, N={N}')
X=2 * np.random.rand(M,N)
y=4 + 3*X + np.random.randn(M,1)
y=y[:,0] # well, could do better here!
DoPlot(X, y)
print('Test data for classifier:')
print(f' X.shape={X.shape}')
print(f' y.shape={y.shape}\n')
reg = MyRegressor()
f=reg.fit(X, y)
p=reg.predict(X)
assert p.shape==y.shape
reg_mse, reg_score = PrintResults('Result for MyRegressor...', f, X, y, p)
```
%% Cell type:markdown id: tags: