ex5: Bias and Variance, Training set & Validation set & Test set

#Bias and Variance, Training set & Validation set & Test set偏差和方差、訓練集&驗證集&測試集
# Rugulation on linear regression
# data visualization
# 對一個水庫的洩洪水量以及水庫水位進行正則化的線性歸回。然後將會探討方差-偏差的問題
import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# vector shape
# loadmat use to read matlab
data = sio.loadmat('Coursera-ML-AndrewNg-Notes-master\\code\\ex5-bias vs variance\\ex5data1.mat')
data

#each feature's vector
X, y, Xval, yval, Xtest, ytest = map(np.ravel,[data['X'], data['y'], data['Xval'], data['yval'], data['Xtest'], data['ytest']])
X.shape, y.shape, Xval.shape, yval.shape, Xtest.shape, ytest.shape


# show tje figure

fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(X, y)
ax.set_xlabel('water_level')
ax.set_ylabel('flow')
plt.show()

# rugulation the cost function of linear regression
#加入常數項
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]

# design the cost function
def cost(theta, X, y):
    """
    X= R(m*n), m records, n features
    y= R(m)
    theta= R(n), linear regression parameters
    """
    m = X.shape[0]

    inner = X @ theta - y  # R(m*1)

    # 1*m @ m*1 = 1*1 in matrix multiplication
    # but you know numpy didn't do transpose in 1d array, so here is just a
    # vector inner product to itselves
    #  inner.T 都要轉置因為Xj(i)是X(i)微分後
    square_sum = inner.T @ inner
    cost = square_sum / (2 * m)

    return cost

# rugulation the cost function of linear regression
def costReg(theta, X, y, reg = 1):
    
    m = X.shape[0]

    regularized_term = (reg / (2 * m)) * np.power(theta[1:], 2).sum()

    return cost(theta, X, y) + regularized_term
# valid
theta = np.ones(X.shape[1])
costReg(theta, X, y, 1)

# gradiented rugulation the cost function of linear regression
#  .T 都要轉置因為Xj(i)是X(i)微分後
def gradient(theta, X, y):
    m = X.shape[0]

    inner = X.T @ (X @ theta - y)  # (m,n).T @ (m, 1) -> (n, 1)

    return inner / m

def gradientReg(theta, X, y, reg):
    m = X.shape[0]

    regularized_term = theta.copy()  # same shape as theta
    regularized_term[0] = 0  # don't regularize intercept theta

    regularized_term = (reg / m) * regularized_term

    return gradient(theta, X, y) + regularized_term
    
gradientReg(theta, X, y, 1)

# try to fit the best solution lanta=0
#用工具庫求参數最優解
#res.x就是會call此函數
theta = np.ones(X.shape[1])
final_theta = opt.minimize(fun=costReg, x0=theta, args=(X, y, 0), method='TNC', jac=gradientReg, options={'disp': True}).x
final_theta

# show the figure
# y=mx+b
b = final_theta[0] # intercept
m = final_theta[1] # slope

fig, ax = plt.subplots(figsize=(12,8))
plt.scatter(X[:,1], y, c='r', label="Training data")
plt.plot(X[:, 1], X[:, 1]*m + b, c='b', label="Prediction")
ax.set_xlabel('water_level')
ax.set_ylabel('flow')
ax.legend()
plt.show()

# Bias and Variance and not use regulation
# design
def linear_regression(X, y, l=1):
    """linear regression
    args:
        X: feature matrix, (m, n+1) # with incercept x0=1
        y: target vector, (m, )
        l: lambda constant for regularization

    return: trained parameters
    """
    # init theta
    theta = np.ones(X.shape[1])

    # train it
    res = opt.minimize(fun=costReg,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=gradientReg,
                       options={'disp': True})
    return res

#tc = costReg(res.x, X[:i, :], y[:i], 0)是 train set，X[:i, :]就是對i跌代每一項的完整數據，X[:i, 1]對i跌代每一項的第2列數據
#cv = costReg(res.x, Xval, yval, 0)是 cross validation set
training_cost, cv_cost = [], []
m = X.shape[0]
for i in range(1, m+1):
    res = linear_regression(X[:i, :], y[:i], 0)
    
    tc = costReg(res.x, X[:i, :], y[:i], 0)
    cv = costReg(res.x, Xval, yval, 0)
   
    training_cost.append(tc)
    cv_cost.append(cv)


# plot figure
fig, ax = plt.subplots(figsize=(12,8))
plt.plot(np.arange(1, m+1), training_cost, label='training cost')
plt.plot(np.arange(1, m+1), cv_cost, label='cv cost')
plt.legend()
plt.show()