Require Import Reals.
Require Import Vector.
Open Scope R_scope.
Module Main.
(* Alias for a real-valued vector of size n *)
Notation Vec n := (Vector.t R n).
Notation CostFunction n := (Vector.t R n -> R).
Notation GradientFunction n := (Vector.t R n -> Vector.t R n).
(* All the points x_1 through x_k, where each x is in R^n for some arbitrary dimension n *)
Notation X_Points k n := (Vector.t (Vec n) k).
(**
We are attempting to prove convergence of gradient descent in Coq and see how far we get.
Reference: https://www.stat.cmu.edu/~ryantibs/convexopt-F13/scribes/lec6.pdf
Comments are occasionally taken from the reference.
*)
(* BEGIN Utilities *)
Definition L2norm {n: nat} (x : Vec n) : R.
Admitted.
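(* A hedged sketch of a concrete instance of the norm left Admitted above: the standard
   Euclidean (L2) norm, written with Vector.fold_right. The name L2norm_sketch is our own
   addition; nothing below depends on it. *)
Definition L2norm_sketch {n: nat} (x : Vec n) : R :=
  sqrt (fold_right (fun xi acc => xi * xi + acc) x 0).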
(* Not sure why some of these vector functions are not in the stdlib; maybe I'm missing them. *)
(* Might be nice to introduce notation for these. *)
Definition vector_subtract {n:nat} (v1 v2: Vec n) : Vec n :=
map2 (fun x y => x - y) v1 v2.
Definition vector_addition {n:nat} (v1 v2: Vec n) : Vec n :=
map2 (fun x y => x + y) v1 v2.
Definition scalar_mult {n: nat} (s: R) (v: Vec n) : Vec n.
Admitted.
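(* A hedged sketch of the scalar multiplication left Admitted above, assuming the usual
   componentwise meaning; scalar_mult_sketch is our own name and is not used below. *)
Definition scalar_mult_sketch {n: nat} (s: R) (v: Vec n) : Vec n :=
  map (fun x => s * x) v.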
(* Distance for real numbers *)
Definition dist_R (x y : R) : R := Rabs (x - y).
(* Transpose v1, then multiply with v2, i.e. the dot product v1^T v2 *)
Definition transpose_mult {n: nat} (v1 v2: Vec n) : R.
Admitted.
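(* A hedged sketch of the dot product v1^T v2 left Admitted above, as the sum of componentwise
   products; transpose_mult_sketch is our own name and is not used below. *)
Definition transpose_mult_sketch {n: nat} (v1 v2: Vec n) : R :=
  fold_right Rplus (map2 (fun x y => x * y) v1 v2) 0.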
(* BEGIN Supporting Lemmas and Definitions *)
Definition lipschitz {n: nat} (L: R) (f: CostFunction n) : Prop :=
forall x1 x2 : Vec n,
dist_R (f x1) (f x2) <= L * (L2norm (vector_subtract x1 x2)).
(* Could make this more polymorphic, but could also state a theorem that lipschitz implies this. *)
Definition lipschitz_gradient {n: nat} (L: R) (f: GradientFunction n) : Prop :=
forall x1 x2 : Vec n,
L2norm (vector_subtract (f x1) (f x2)) <= L * (L2norm (vector_subtract x1 x2)).
(* x_star is the optimal point of f.
   TODO: do we just need to state that it attains the lowest value?
*)
Definition optimal {n: nat} (x_star: Vec n) (f: CostFunction n) : Prop :=
forall x : Vec n, f x_star <= f x.
(* TODO: write a definition *)
Definition convex {n: nat} (f: CostFunction n) : Prop.
Admitted.
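(* A hedged sketch of what the convexity definition above could be, using the standard chord
   condition; convex_sketch is our own name and is not used below. *)
Definition convex_sketch {n: nat} (f: CostFunction n) : Prop :=
  forall (x y : Vec n) (lambda : R),
    0 <= lambda <= 1 ->
    f (vector_addition (scalar_mult lambda x) (scalar_mult (1 - lambda) y))
      <= lambda * f x + (1 - lambda) * f y.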
(* TODO: write a definition *)
Definition differentiable {n: nat} (f: CostFunction n) : Prop.
Admitted.
(* TODO: see the Wikipedia article on gradients for details *)
Definition grad {n: nat} (f : CostFunction n ) : GradientFunction n.
Admitted.
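(* A hedged sketch of the first-order characterization of convexity that the 6.3 -> 6.4 A step
   of the reference implicitly relies on (f(y) >= f(x) + ∇f(x)^T (y − x)); the name and statement
   are our own addition and are not proved here. *)
Definition convex_first_order {n: nat} (f: CostFunction n) : Prop :=
  forall x y : Vec n,
    f x + transpose_mult ((grad f) x) (vector_subtract y x) <= f y.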
(* Gradient descent after k steps *)
Fixpoint gradient_descent {n: nat} (k : nat) (x : Vec n) (f : CostFunction n) (learningRate: R) : Vec n :=
match k with
| O => x (* Last step: return the optimized weights *)
| S k' =>
let gradient := grad f x in
let step := scalar_mult learningRate gradient in
(* Next weights *)
let x_i_plus_1 := vector_subtract x step in
gradient_descent k' x_i_plus_1 f learningRate
end.
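(* A small sanity-check example (our own addition): unfolding one step of gradient_descent
   gives exactly the familiar update x − t·∇f(x). *)
Lemma gradient_descent_one_step :
  forall {n: nat} (x : Vec n) (f : CostFunction n) (t : R),
    gradient_descent 1%nat x f t = vector_subtract x (scalar_mult t (grad f x)).
Proof.
  intros. reflexivity.
Qed.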
(* TODO: consider making this its own function so we can do 6.2 *)
(* Definition update_weights *)
(* TODO I forgot which one this was and if we needed it *)
(* Lemma lipschitz_implies_gradient_bound : lipschitz L f -> grad f -> True. *)
(**
Our assumption that ∇f is Lipschitz continuous with constant L implies that ∇²f(x) ⪯ L·I,
or equivalently that ∇²f(x) − L·I is a negative semidefinite matrix. Using this fact we can
perform a quadratic expansion of f around f(x) and obtain the following inequality:
  f(y) ≤ f(x) + ∇f(x)^T (y − x) + 1/2 ∇²f(x) ‖y − x‖²₂
       ≤ f(x) + ∇f(x)^T (y − x) + 1/2 L ‖y − x‖²₂
*)
Lemma lipschitz_implies_inequality : forall {n: nat} (L: R) (x y : Vec n) (f: CostFunction n),
L > 0 ->
lipschitz_gradient L (grad f) ->
f y <= f x + transpose_mult ((grad f) x) (vector_subtract y x) + (1/2) * L * (L2norm (vector_subtract y x) ^ 2).
Admitted.
(* Minor Lemmas*)
(* Using t ≤ 1/L, we know that −(1 − 1/2 Lt) = 1/2 Lt − 1 ≤ 1/2 L(1/L) − 1 = 1/2 − 1 = − 1 / 2 *)
(* TODO see https://www.cs.princeton.edu/courses/archive/fall07/cos595/stdlib/html/Coq.Reals.Raxioms.html*)
(* We should do this proof first because it's the easiest*)
Lemma t_less_than_one_over_L_implies : forall (L t : R),
L > 0 ->
t <= 1 / L ->
- (1 - 1/2 * L * t) <= -1/2.
Proof.
intros L t H_L_gt_0 H_t_less_than_eq_1_div_L.
unfold Rdiv in H_t_less_than_eq_1_div_L.
rewrite Rmult_1_l in H_t_less_than_eq_1_div_L.
Admitted.
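(* A hedged helper sketch (our own addition, not from the reference): the nonlinear step
   L * t <= 1 that the lemma above ultimately needs, proved with basic Reals lemmas only. *)
Lemma L_times_t_le_one : forall (L t : R),
  L > 0 -> t <= 1 / L -> L * t <= 1.
Proof.
  intros L t HL Ht.
  apply Rle_trans with (L * (1 / L)).
  - apply Rmult_le_compat_l.
    + apply Rlt_le. exact HL.
    + exact Ht.
  - unfold Rdiv. rewrite Rmult_1_l. rewrite Rinv_r.
    + apply Rle_refl.
    + apply Rgt_not_eq. exact HL.
Qed.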
(* 6.2 *)
(**
Now let's plug in the gradient descent update by letting y = x+ = x − t∇f(x). We then get:
  f(x+) ≤ f(x) + ∇f(x)^T (x+ − x) + 1/2 L ‖x+ − x‖²₂
        = f(x) − (1 − 1/2 L t) t ‖∇f(x)‖²₂      (some algebra)
        ≤ f(x) − 1/2 t ‖∇f(x)‖²₂                (then use t_less_than_one_over_L_implies)
*)
Lemma f_x_plus_less_than : forall {n : nat} (x : Vec n) (f : CostFunction n) (L t : R),
let x_plus := vector_subtract x (scalar_mult t (grad f x)) in
f x_plus <= f x + transpose_mult (grad f x) (vector_subtract x_plus x) + 1 / 2 * L * L2norm (vector_subtract x_plus x) ^ 2
->
(* x+ = x − t∇f (x) *)
f x_plus <= f x - 1/2 * t * ((L2norm (grad f x)) ^ 2).
Admitted.
(* TODO apply t_less_than_one_over_L_implies*)
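(* Hedged sketches of the algebra facts the step above relies on (our own additions, left
   unproved): the inner product of a vector with itself is the squared L2 norm, and
   transpose_mult pulls scalar multiples out of its second argument. Both hold for the
   standard Euclidean definitions of the Admitted operations above. *)
Lemma transpose_mult_self : forall {n: nat} (v : Vec n),
  transpose_mult v v = (L2norm v) ^ 2.
Admitted.
Lemma transpose_mult_scalar_r : forall {n: nat} (s : R) (v1 v2 : Vec n),
  transpose_mult v1 (scalar_mult s v2) = s * transpose_mult v1 v2.
Admitted.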
Search (fold_left).
(* 6.6: The total gap between the cost at each iteration and the optimal cost is bounded by a
   factor involving the distance from the initial x to the optimal x and the learning rate.
   At each iteration we take the gap between the current cost and the optimal cost, then sum
   those gaps over all iterations; that sum should be **bounded** as follows:
     ∑_{i=1}^{k} ( f(x^(i)) − f(x∗) ) ≤ 1/(2t) ‖x^(0) − x∗‖²₂
   X : X_Points k n represents x^(1) to x^(k) for the k iterations.
*)
Lemma bounded_sum_of_costs: forall {n: nat} {k: nat} (t: R) (x_0: Vec n) (x_star: Vec n) (X: X_Points k n) (f : CostFunction n),
let sums_of_iterations := fold_left (fun (acc : R) (x_i : Vec n) => acc + (f x_i) - (f x_star)) 0 X in
sums_of_iterations <= (L2norm (vector_subtract x_0 x_star)) ^ 2 / (2 * t).
Admitted.
(* TODO: add the hypothesis that the costs f(x_i) are decreasing over the iterates *)
Lemma r1_lt_r2: forall {n: nat} {k: nat} (t: R) (x_0: Vec n) (x_k : Vec n) (x_star: Vec n) (X: X_Points k n) (f : CostFunction n),
let sums_of_iterations := fold_left (fun (acc : R) (x_i : Vec n) => acc + (f x_i) - (f x_star)) 0 X in
f x_k - f x_star <= sums_of_iterations / (INR k). (*TODO add k and inequality*)
Admitted.
Lemma r2_lt_r3: forall {n: nat} {k: nat} (t: R) (x_0: Vec n) (x_k : Vec n) (x_star: Vec n) (X: X_Points k n) (f : CostFunction n),
let sums_of_iterations := fold_left (fun (acc : R) (x_i : Vec n) => acc + (f x_i) - (f x_star)) 0 X in
sums_of_iterations / (INR k) <= (L2norm (vector_subtract x_0 x_star)) ^ 2 / (2 * t * (INR k)). (*TODO add k and inequality*)
Admitted.
(* We essentially want to replace a goal r1 <= r3 by r1 <= r2 using the middle term
   (transitivity). This sort of helps us do that. *)
Lemma Rlt_le_trans_middle : forall r1 r2 r3, r1 <= r2 -> r2 <= r3 -> (r1 <= r3) = (r1 <= r2).
Admitted.
(* 6.5 -> 6.6 A *)
Lemma L6_5_implies_6_6_A: forall {n: nat} {k: nat} (t: R) (X: X_Points k n) (x_0: Vec n) (x_star: Vec n) (f: CostFunction n) (i: nat) (Hi: Nat.lt i k) (HSi: Nat.lt (S i) k) (Hi_minus_one: Nat.lt (i-1) k),
let xi_minus_one := nth X (Fin.of_nat_lt Hi_minus_one) in
let xi := nth X (Fin.of_nat_lt Hi) in
let xi_plus_one := nth X (Fin.of_nat_lt HSi) in
let f_x_star := f x_star in
let f_xi_plus_one := f xi_plus_one in
let six_five_right_hand := 1 / (2 * t) * (L2norm (vector_subtract xi x_star)^2 - L2norm(vector_subtract xi_plus_one x_star)^2) in
f_xi_plus_one - f_x_star <= six_five_right_hand ->
(* 6.6 A *)
let six_six_a_left_hand := fold_left (fun (acc : R) (x_i : Vec n) => acc + f x_i - f x_star) 0 X in
(* Use an acc that has the previous element to track x^(i-1) *)
let six_six_a_right_hand_sum := fold_left (
fun acc (x_i : Vec n) => match acc with
| (totalSum, x_i_minus_1) => (totalSum + (L2norm(vector_subtract x_i_minus_1 x_star)^2 - L2norm(vector_subtract x_i x_star)^2), x_i)
end
) (0, x_0) X in
let six_six_a_right_hand := match six_six_a_right_hand_sum with
| (sum, _) => 1 / (2 * t) * sum
end in
six_six_a_left_hand <= six_six_a_right_hand.
Proof.
Admitted.
(* 6.6 A -> 6.6 C *)
Lemma L6_6_A_implies_6_6_C: forall {n: nat} {k: nat} (t: R) (X: X_Points k n) (x_0: Vec n) (x_star: Vec n) (f: CostFunction n),
(* 6.6 A *)
let six_six_a_left_hand := fold_left (fun (acc : R) (x_i : Vec n) => acc + f x_i - f x_star) 0 X in
(* Use an acc that has the previous element to track x^(i-1) *)
let six_six_a_right_hand_sum := fold_left (
fun acc (x_i : Vec n) => match acc with
| (totalSum, x_i_minus_1) => (totalSum + (L2norm(vector_subtract x_i_minus_1 x_star)^2 - L2norm(vector_subtract x_i x_star)^2), x_i)
end
) (0, x_0) X in
let six_six_a_right_hand := match six_six_a_right_hand_sum with
| (sum, _) => 1 / (2 * t) * sum
end in
six_six_a_left_hand <= six_six_a_right_hand ->
(* 6.6 C *)
six_six_a_left_hand <= (1/2*t) * (L2norm(vector_subtract x_0 x_star) ^ 2).
Proof.
Admitted.
Lemma L6_6_C_implies_6_7_A: forall {n: nat} {k: nat} (t: R) (X: X_Points k n) (x_0: Vec n) (x_star: Vec n) (f: CostFunction n) (x_k : Vec n),
(* 6.6 C *)
let six_six_a_left_hand := fold_left (fun (acc : R) (x_i : Vec n) => acc + f x_i - f x_star) 0 X in
six_six_a_left_hand <= (1/2*t) * (L2norm(vector_subtract x_0 x_star) ^ 2) ->
(* 6.7 A *)
f x_k - f x_star <=
fold_left (fun (acc : R) (x_i : Vec n) => acc + f x_i - f x_star) 0 X / INR k.
Proof.
Admitted.
(* Hmm, this one moves backwards while the rest move forward. *)
(* This was the strategy we worked out with Joe, so I must have meant to go backwards. *)
Lemma L6_7_B_implies_6_7A: forall {n: nat} {k: nat} (t: R) (x_0: Vec n) (x_star: Vec n) (X: X_Points k n) (f : CostFunction n) (x_k : Vec n),
(* 6.7 B *)
let sums_of_iterations := fold_left (fun (acc : R) (x_i : Vec n) => acc + (f x_i) - (f x_star)) 0 X in
sums_of_iterations <= (L2norm (vector_subtract x_0 x_star)) ^ 2 / (2 * t) ->
(* 6.7 A *)
f x_k - f x_star <=
fold_left (fun (acc : R) (x_i : Vec n) => acc + f x_i - f x_star) 0 X / INR k.
Proof.
Admitted.
(* OK, so I need a 6.7 A implies 6.6 C. *)
(* So, copy and paste my old code. *)
Lemma L6_7_A_implies_6_6_C: forall {n: nat} {k: nat} (t: R) (x_0: Vec n) (x_star: Vec n) (X: X_Points k n) (f: CostFunction n) (x_k : Vec n),
(* 6.7 A *)
f x_k - f x_star <=
fold_left (fun (acc : R) (x_i : Vec n) => acc + f x_i - f x_star) 0 X / INR k ->
(* 6.6 C *)
let six_six_a_left_hand := fold_left (fun (acc : R) (x_i : Vec n) => acc + f x_i - f x_star) 0 X in
six_six_a_left_hand <= (L2norm(vector_subtract x_0 x_star) ^ 2) / (2 * t).
Proof.
Admitted.
(* OK, this strategy is working; let's keep going backwards and delete the forward lemmas later. *)
Lemma L6_6_C_implies_6_6_A: forall {n: nat} {k: nat} (t: R) (x_0: Vec n) (x_star: Vec n) (X: X_Points k n) (f: CostFunction n),
(* 6.6 C *)
let six_six_a_left_hand := fold_left (fun (acc : R) (x_i : Vec n) => acc + f x_i - f x_star) 0 X in
six_six_a_left_hand <= (L2norm(vector_subtract x_0 x_star) ^ 2) / (2 * t) ->
(* 6.6 A *)
six_six_a_left_hand <=
(* Use an acc that has the previous element to track x^(i-1) *)
let six_six_a_right_hand_sum := fold_left (
fun acc (x_i : Vec n) => match acc with
| (totalSum, x_i_minus_1) => (totalSum + (L2norm(vector_subtract x_i_minus_1 x_star)^2 - L2norm(vector_subtract x_i x_star)^2), x_i)
end
) (0, x_0) X in
let six_six_a_right_hand := match six_six_a_right_hand_sum with
| (sum, _) => sum / (2 * t)
end in
six_six_a_right_hand.
Proof.
Admitted.
(* Yeonhee and I were making another attempt with the forward strategy *)
Lemma L6_7_A_implies_6_7_B: forall {n: nat} {k: nat} (t: R) (X: X_Points k n) (x_0: Vec n) (x_star: Vec n) (f : CostFunction n) (x_k : Vec n),
(* 6.7 A *)
let sums_of_iterations := fold_left (fun (acc :R) (x_i : Vec n) => acc + (f x_i) - (f x_star)) 0 X in
f x_k - f x_star <=
fold_left (fun (acc : R) (x_i : Vec n) => acc + f x_i - f x_star) 0 X / INR k ->
(* 6.7 B *)
f x_k - f x_star <= (L2norm (vector_subtract x_0 x_star)) ^ 2 / (2 * t * (INR k)).
Proof.
Admitted.
(**
Theorem 6.1: Suppose the function f : R^n → R is convex and differentiable, and that its
gradient is Lipschitz continuous with constant L > 0, i.e. we have that
‖∇f(x) − ∇f(y)‖₂ ≤ L ‖x − y‖₂ for any x, y. Then if we run gradient descent for k iterations
with a fixed step size t ≤ 1/L, it will yield a solution x^(k) which satisfies
  f(x^(k)) − f(x∗) ≤ ‖x^(0) − x∗‖²₂ / (2 t k),
where f(x∗) is the optimal value. Intuitively, this means that gradient descent is guaranteed
to converge and that it converges with rate O(1/k).
*)
Theorem L6_2_implies_6_3: forall {n: nat} {k: nat} (t: R) (X: X_Points k n) (f: CostFunction n) (L: R) (i: nat) (Hi: Nat.lt i k) (HSi: Nat.lt (S i) k),
let xi := nth X (Fin.of_nat_lt Hi) in
let xi_plus_one := nth X (Fin.of_nat_lt HSi) in
(* 6.2A *)
f xi_plus_one <= f xi + transpose_mult ((grad f) xi) (vector_subtract xi_plus_one xi) + (1/2) * L* (L2norm (vector_subtract xi_plus_one xi)^2)
-> (* 6.3 *)
f xi_plus_one <= f xi - (1/2 * t) * t * L2norm((grad f) xi)^2.
Proof.
Admitted.
Theorem L6_3_implies_6_4_A: forall {n: nat} {k: nat} (t: R) (X: X_Points k n) (x_star: Vec n) (f: CostFunction n) (L: R) (i: nat) (Hi: Nat.lt i k) (HSi: Nat.lt (S i) k),
let xi := nth X (Fin.of_nat_lt Hi) in
let xi_plus_one := nth X (Fin.of_nat_lt HSi) in
(* 6.3 *)
f xi_plus_one <= f xi - (1/2 * t) * t * L2norm((grad f) xi)^2 ->
(* 6.4 A *)
f xi_plus_one <= f x_star + transpose_mult((grad f) xi) (vector_subtract xi x_star) - (t / 2) * L2norm((grad f) xi)^2.
Proof.
Admitted.
Theorem L6_4_A_implies_6_4_D: forall {n: nat} {k: nat} (t: R) (X: X_Points k n) (x_star: Vec n) (f: CostFunction n) (L: R) (i: nat) (Hi: Nat.lt i k) (HSi: Nat.lt (S i) k),
let xi := nth X (Fin.of_nat_lt Hi) in
let xi_plus_one := nth X (Fin.of_nat_lt HSi) in
let distance_xi_plus_one_x_star := f xi_plus_one - f x_star in
(* 6.4 A *)
f xi_plus_one <= f x_star + transpose_mult((grad f) xi) (vector_subtract xi x_star) - (t / 2) * L2norm((grad f) xi)^2 ->
(* 6.4 D *)
distance_xi_plus_one_x_star <= 1 / (2*t) * (L2norm (vector_subtract xi x_star)^2 - L2norm(vector_subtract (vector_subtract xi (scalar_mult t ((grad f) xi))) x_star)^2)
.
Proof.
Admitted.
(* 6.4D -> 6.5 *)
Lemma L6_4_D_implies_6_5: forall {n: nat} {k: nat} (t: R) (X: X_Points k n) (x_star : Vec n) (f: CostFunction n) (i: nat) (Hi: Nat.lt i k) (HSi: Nat.lt (S i) k) ,
let xi := nth X (Fin.of_nat_lt Hi) in
let xi_plus_one := nth X (Fin.of_nat_lt HSi) in (* why did we define x+ like this*)
let distance_xi_plus_one_x_star := f xi_plus_one - f x_star in
(*6.4D *)
distance_xi_plus_one_x_star <= 1 / (2*t) * (L2norm (vector_subtract xi x_star)^2 - L2norm(vector_subtract (vector_subtract xi (scalar_mult t ((grad f) xi))) x_star)^2)
->
(*6.5*)
distance_xi_plus_one_x_star <= 1 / (2 * t) * (L2norm (vector_subtract xi x_star)^2 - L2norm(vector_subtract xi_plus_one x_star)^2).
Admitted.
Theorem lipschitz_implies_grad: forall {n: nat} (L: R) (f: CostFunction n),
lipschitz L f ->
lipschitz_gradient L (grad f).
Proof.
Admitted.
Theorem convergence : forall {n: nat} {k: nat} (t: R) (x_0: Vec n) (x_star: Vec n) (x_k: Vec n) (X: X_Points k n) (f: CostFunction n) (L : R) (i: nat) (Hi: Nat.lt i k) (HSi: Nat.lt (S i) k) (Hi_minus_one: Nat.lt (i-1) k),
lipschitz L f ->
L > 0 ->
convex f ->
differentiable f ->
optimal x_star f ->
let xi_minus_one := nth X (Fin.of_nat_lt Hi_minus_one) in
let xi := nth X (Fin.of_nat_lt Hi) in
let xi_plus_one := nth X (Fin.of_nat_lt HSi) in
f x_k - f x_star <= (L2norm (vector_subtract x_0 x_star)) ^ 2 / (2 * t * (INR k)).
Proof.
intros.
pose proof (@lipschitz_implies_inequality n L xi xi_plus_one) as MainH.
apply (@L6_2_implies_6_3 n k t X f) in MainH.
apply (@L6_3_implies_6_4_A n k t X x_star f) in MainH.
apply (@L6_4_A_implies_6_4_D n k t X x_star f) in MainH.
apply (@L6_4_D_implies_6_5 n k t X x_star f) in MainH.
apply (@L6_5_implies_6_6_A n k t X x_0 x_star f) in MainH.
apply (@L6_6_A_implies_6_6_C n k t X x_0 x_star f) in MainH.
apply (@L6_6_C_implies_6_7_A n k t X x_0 x_star f x_k) in MainH.
apply (@L6_7_A_implies_6_7_B n k t X x_0 x_star f x_k) in MainH.
- apply MainH.
- auto.
- apply t.
- apply t.
- apply H0.
- apply (lipschitz_implies_grad) in H. apply H.
Qed.
(**
We decided, thanks to Steven, that we can just introduce hypotheses moving forward,
e.g. assume we have 6.2, or whatever holds right before 6.1.
*)
End Main.