forked from insightbook/data-science-from-scratch
-
Notifications
You must be signed in to change notification settings - Fork 32
/
simple_linear_regression.py
98 lines (74 loc) · 3.24 KB
/
simple_linear_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def predict(alpha: float, beta: float, x_i: float) -> float:
return beta * x_i + alpha
def error(alpha: float, beta: float, x_i: float, y_i: float) -> float:
"""
The error from predicting beta * x_i + alpha
when the actual value is y_i
"""
return predict(alpha, beta, x_i) - y_i
from scratch.linear_algebra import Vector
def sum_of_sqerrors(alpha: float, beta: float, x: Vector, y: Vector) -> float:
return sum(error(alpha, beta, x_i, y_i) ** 2
for x_i, y_i in zip(x, y))
from typing import Tuple
from scratch.linear_algebra import Vector
from scratch.statistics import correlation, standard_deviation, mean
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
"""
Given two vectors x and y,
find the least-squares values of alpha and beta
"""
beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
alpha = mean(y) - beta * mean(x)
return alpha, beta
x = [i for i in range(-100, 110, 10)]
y = [3 * i - 5 for i in x]
# Should find that y = 3x - 5
assert least_squares_fit(x, y) == (-5, 3)
from scratch.statistics import num_friends_good, daily_minutes_good
alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good)
assert 22.9 < alpha < 23.0
assert 0.9 < beta < 0.905
from scratch.statistics import de_mean
def total_sum_of_squares(y: Vector) -> float:
"""the total squared variation of y_i's from their mean"""
return sum(v ** 2 for v in de_mean(y))
def r_squared(alpha: float, beta: float, x: Vector, y: Vector) -> float:
"""
the fraction of variation in y captured by the model, which equals
1 - the fraction of variation in y not captured by the model
"""
return 1.0 - (sum_of_sqerrors(alpha, beta, x, y) /
total_sum_of_squares(y))
rsq = r_squared(alpha, beta, num_friends_good, daily_minutes_good)
assert 0.328 < rsq < 0.330
def main():
import random
import tqdm
from scratch.gradient_descent import gradient_step
num_epochs = 10000
random.seed(0)
guess = [random.random(), random.random()] # choose random value to start
learning_rate = 0.00001
with tqdm.trange(num_epochs) as t:
for _ in t:
alpha, beta = guess
# Partial derivative of loss with respect to alpha
grad_a = sum(2 * error(alpha, beta, x_i, y_i)
for x_i, y_i in zip(num_friends_good,
daily_minutes_good))
# Partial derivative of loss with respect to beta
grad_b = sum(2 * error(alpha, beta, x_i, y_i) * x_i
for x_i, y_i in zip(num_friends_good,
daily_minutes_good))
# Compute loss to stick in the tqdm description
loss = sum_of_sqerrors(alpha, beta,
num_friends_good, daily_minutes_good)
t.set_description(f"loss: {loss:.3f}")
# Finally, update the guess
guess = gradient_step(guess, [grad_a, grad_b], -learning_rate)
# We should get pretty much the same results:
alpha, beta = guess
assert 22.9 < alpha < 23.0
assert 0.9 < beta < 0.905
if __name__ == "__main__": main()