Commit db107d0 (1 parent: 6a4bffc)
Signed-off-by: Thomas Gassmann <tgassmann@student.ethz.ch>
Showing 50 changed files with 592 additions and 143 deletions.
@@ -0,0 +1,7 @@
\section*{Bayesian Decision Theory}

Given $p(y \; | \; x)$, a set of actions $A$ and a cost $C: Y \times A \mapsto \R$, pick the action with the minimum expected cost (i.e. maximum expected utility):

\qquad \qquad $a^* = \text{argmin}_{a \in A} \; \E_y[C(y,a) \; | \; x]$

Can be used for asymmetric costs or abstention.
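A minimal Python sketch of how this rule yields abstention; the posterior values and the cost matrix below are made up for illustration:

import numpy as np

# Posterior p(y | x) over two classes, e.g. from a trained classifier.
posterior = np.array([0.55, 0.45])

# Actions: predict class 0, predict class 1, or abstain.
# cost[y, a] = C(y, a); abstaining costs 0.3 regardless of the true label.
cost = np.array([
    [0.0, 1.0, 0.3],   # true label y = 0
    [1.0, 0.0, 0.3],   # true label y = 1
])

expected_cost = posterior @ cost          # E_y[C(y, a) | x] for each action a
a_star = int(np.argmin(expected_cost))    # -> 2: abstain, since the posterior is uncertain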
@@ -0,0 +1,55 @@
\section*{Classification}

\textbf{Zero-One loss} \quad neither convex nor continuous

\qquad \qquad $\ell_{0-1}(\hat{f}(x), y) = \mathbb{I}_{y \neq \sgn \hat{f}(x)}$

\textbf{Logistic loss} \quad $\log(1 + e^{-y \hat{f}(x)})$

\qquad \qquad $\nabla_w \ell(\hat{f}(x_i), y_i) = \frac{-y_i x_i}{1 + e^{y_i \hat{f}(x_i)}}$

\textbf{Hinge loss} \quad \ $\max(0, 1-y \hat{f}(x))$

\textbf{Softmax} $p(1 \; | \; x) = \frac{1}{1 + e^{- \hat{f}(x)}}, \; p(-1 \; | \; x) = \frac{1}{1 + e^{\hat{f}(x)}}$

Multi-Class \ \ \ $\hat{p}_k = e^{\hat{f}_k(x)} / \sum_{j=1}^K e^{\hat{f}_j(x)}$

\subsection*{Linear Classifiers}

$f(x) = w^\top x$, with decision boundary $f(x) = 0$. \smallskip

If the data is lin. separable, grad. descent converges to the \textbf{Maximum-Margin Solution}:

\quad $w_\text{MM} = \text{argmax}_w \; \text{margin}(w) \; \text{with } ||w||_2 = 1$

where $\text{margin}(w) = \min_i y_i w^\top x_i$.

\subsection*{Support Vector Machines}
\textbf{Hard SVM}

\qquad $\hat{w} = \text{argmin}_w ||w||_2 \; \; \text{s.t. } \forall i \; y_i w^\top x_i \geq 1$

\textbf{Soft SVM} \quad allow ``slack'' in the constraints
$$\hat{w} = \text{argmin}_{w} \frac{1}{2} ||w||_2^2 + \lambda \sum_{i=1}^n \underbrace{\max (0, 1 - y_i w^\top x_i)}_{\text{hinge loss}}$$ \\[-23pt]

\subsection*{Metrics}
Choose $+1$ as the more important class. \\[-18pt]

\begin{multicols*}{2}
\begin{center}
\includegraphics[width=\columnwidth]{confusion-matrix.jpeg}
\end{center}

$\text{error}_1 / \text{FPR}: \frac{\text{FP}}{\text{TN + FP}}$
$\text{error}_2 / \text{FNR}: \frac{\text{FN}}{\text{TP + FN}}$
$\text{Precision}: \frac{\text{TP}}{\text{TP + FP}}$
$\text{TPR / Recall}: \frac{\text{TP}}{\text{TP + FN}}$
\end{multicols*}

.\\[-20pt]
\textbf{AUROC}: Plot TPR vs. FPR and compare different ROC curves by the area under the curve.

\textbf{F1-Score}: $\frac{2\text{TP}}{2\text{TP + FP + FN}}$, $\text{Accuracy}: \frac{\text{TP + TN}}{\text{P + N}}$

Goal: large recall and small FPR.
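A minimal numpy sketch of minimizing the soft-SVM objective above by subgradient descent; the function name, step size and iteration count are illustrative choices, not from the summary:

import numpy as np

def soft_svm_subgradient(X, y, lam=1.0, eta=0.01, epochs=200):
    # Minimizes 0.5 * ||w||^2 + lam * sum_i max(0, 1 - y_i w^T x_i).
    n, d = X.shape
    w = np.zeros(d)
    for _ in range(epochs):
        margins = y * (X @ w)
        active = margins < 1                              # points inside the margin or misclassified
        subgrad = w - lam * (y[active, None] * X[active]).sum(axis=0)
        w -= eta * subgrad
    return w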
@@ -0,0 +1,69 @@
\section*{Gaussian Mixture Model}

Assume that the data is generated from a convex combination of Gaussian distributions: \\[-8pt]

$p(x \; | \; \theta) = p(x \; | \; \mu, \Sigma, w) = \sum_{j=1}^k w_j \mathcal{N}(x; \mu_j, \Sigma_j)$

We don't have labels and want to cluster this data. The problem is to estimate the parameters of the Gaussian distributions:

\ \ $\text{argmin}_{\theta} \; - \sum_{i=1}^n \log \sum_{j=1}^k w_j \cdot \mathcal{N}(x_i \; | \; \mu_j, \Sigma_j)$

This is a non-convex objective, similar to training a GBC without labels. Start with a guess for the parameters, predict the unknown labels, and then impute the missing data. Now we can get a closed-form update.

\subsection*{Hard-EM Algorithm}

\textbf{E-Step}: predict the most likely class for each data point:
\begin{align*}
z_i^{(t)} &= \argmax{z} \; p(z \; | \; x_i, \theta^{(t-1)}) \\[-5pt]
&= \argmax{z} \; p(z \; | \; \theta^{(t-1)}) \cdot p(x_i \; | \; z, \theta^{(t-1)})
\end{align*}
\textbf{M-Step}: compute the MLE of $\theta^{(t)}$ as for the GBC. \smallskip

Problems: assigns hard labels even if the model is uncertain and thus tries to extract too much information; works poorly if clusters overlap. With uniform weights and spherical covariances it is equivalent to k-Means with Lloyd's heuristic.

\subsection*{Soft-EM Algorithm}

\textbf{E-Step}: calculate the cluster membership weights for each point ($w_j = \pi_j = p(Z = j)$): \\[-8pt]

\quad $\gamma_j^{(t)}(x_i) = p(Z = j \; | \; x_i, \theta^{(t-1)}) = \frac{w_j \cdot p(x_i ; \theta_j^{(t-1)})}{\sum_k w_k \cdot p(x_i ; \theta_k^{(t-1)})}$

\textbf{M-Step}: compute the MLE in closed form:

$w_j^{(t)} = \frac{1}{n} \sum_{i=1}^n \gamma_j^{(t)}(x_i) \quad \; \mu_j^{(t)} = \frac{\sum_{i=1}^n x_i \cdot \gamma_j^{(t)}(x_i)}{\sum_{i=1}^n \gamma_j^{(t)}(x_i)}$

\qquad \quad $\Sigma_j^{(t)} = \frac{\sum_{i=1}^n \gamma_j^{(t)}(x_i)(x_i - \mu_j^{(t)})(x_i - \mu_j^{(t)})^\top}{\sum_{i=1}^n \gamma_j^{(t)}(x_i)}$

Initialize the weights uniformly, the means randomly or with k-Means++, and the variances with spherical init. or the empirical covariance of the data. Select $k$ using cross-validation.

\subsection*{Degeneracy of GMMs}

GMMs can overfit with limited data. Avoid this by adding $v^2 I$ to the covariance so it does not collapse (equiv. to a Wishart prior on the covariance matrix). Choose $v$ by cross-validation.

\subsection*{Gaussian-Mixture Bayes Classifiers}

Assume that $p(x \; | \; y)$ for each class can be modelled by a GMM:

\qquad $p(x \; | \; y) = \sum_{j=1}^{k_y} w_j^{(y)} \mathcal{N}(x; \mu_j^{(y)}, \Sigma_j^{(y)})$

giving highly complex decision boundaries:

\qquad $p(y \; | \; x) = \frac{1}{z} p(y) \sum_{j=1}^{k_y} w_j^{(y)} \mathcal{N}(x; \mu_j^{(y)}, \Sigma_j^{(y)})$

\subsection*{GMMs for Density Estimation}

Can be used for anomaly detection or data imputation. Detect outliers by comparing the estimated density against a threshold $\tau$; this allows controlling the FP rate. Use the ROC curve as evaluation criterion and optimize $\tau$ using CV.

\subsection*{General EM Algorithm}

\textbf{E-Step}: Take the expected value over the latent variables $z$ to generate the likelihood function $Q$:
\begin{align*}
Q(\theta ; \theta^{(t-1)}) &= \E_{Z}[ \log p(X, Z \; | \; \theta) \; | \; X, \theta^{(t-1)}] \\[-5pt]
&= \sum_{i=1}^n \sum_{z_i=1}^k \gamma_{z_i}(x_i) \log p(x_i, z_i \; | \; \theta)
\end{align*}
with $\gamma_z(x) = p(z \; | \; x, \theta^{(t-1)})$

\textbf{M-Step}: Maximize, i.e. compute the MLE:
$$\theta^{(t)} = \argmax{\theta} \; Q(\theta; \theta^{(t-1)})$$

We have monotonic convergence: each EM iteration increases the data likelihood.
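A minimal numpy/scipy sketch of one soft-EM update, following the E- and M-step formulas above (variable names are illustrative):

import numpy as np
from scipy.stats import multivariate_normal

def soft_em_step(X, w, mu, Sigma):
    n, d = X.shape
    k = len(w)
    # E-step: responsibilities gamma[i, j] = p(Z = j | x_i, theta^(t-1))
    dens = np.column_stack([w[j] * multivariate_normal.pdf(X, mu[j], Sigma[j])
                            for j in range(k)])
    gamma = dens / dens.sum(axis=1, keepdims=True)
    # M-step: closed-form MLE given the responsibilities
    Nj = gamma.sum(axis=0)
    w_new = Nj / n
    mu_new = (gamma.T @ X) / Nj[:, None]
    Sigma_new = np.stack([
        (gamma[:, j, None] * (X - mu_new[j])).T @ (X - mu_new[j]) / Nj[j]
        for j in range(k)])
    return w_new, mu_new, Sigma_new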
@@ -0,0 +1,17 @@
\section*{GANs}

Learn $f$: ``simple'' distribution $\mapsto$ nonlinear distribution. Computing the likelihood of the data becomes hard, therefore we need a different loss:
\begin{align*}
\min_{w_G} \max_{w_D} \; & \E_{x \sim p_{\text{data}}} [\log D(x, w_D)] \\[-5pt]
+ &\E_{z \sim p_z} [\log (1 - D(G(z, w_G), w_D))]
\end{align*}

Training requires finding a saddle point; it always converges to the saddle point if $G$, $D$ have enough capacity. For a fixed $G$, the optimal discriminator is:
$$D_G(x) = \frac{p_{\text{data}}(x)}{p_{\text{data}}(x) + p_G(x)}$$

The probability of being fake is $1 - D_G$. A too powerful discriminator could lead to memorization of finite data. Other issues are oscillations/divergence or mode collapse. \smallskip

One possible performance metric (the duality gap):
$$DG = \max_{w_D'} M(w_G, w_D') - \min_{w_G'} M(w_G', w_D)$$

where $M(w_G, w_D)$ is the training objective.
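A small numeric sketch of the optimal-discriminator formula, with two stand-in 1-D Gaussians playing the roles of $p_{\text{data}}$ and $p_G$ (purely illustrative, not from the summary):

import numpy as np
from scipy.stats import norm

xs = np.linspace(-4, 4, 9)
p_data = norm.pdf(xs, loc=0.0, scale=1.0)   # stand-in data density
p_g    = norm.pdf(xs, loc=1.0, scale=1.5)   # stand-in generator density
D_opt = p_data / (p_data + p_g)             # D_G(x), probability that x is real
print(np.round(D_opt, 3))                   # larger where p_data dominates, ~0.5 where the densities match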
@@ -0,0 +1,56 @@
\section*{Generative Modeling}

Aim to estimate $p(x, y)$ for complex situations using Bayes' rule: $p(x,y) = p(x \; | \; y) \cdot p(y)$

\subsection*{Naive Bayes Model}

GM for classification tasks. Assume that, given the class label, the features are independent. This helps estimating $p( x \; | \; y) = \prod_{i=1}^d p(x_i \; | \; y)$.

\subsection*{Gaussian Naive Bayes Classifier}

Naive Bayes Model with Gaussian features. Estimate the parameters via MLE:

MLE for the class prior: $p(y) = \hat p_y = \frac{\text{Count}(Y = y)}{n}$
MLE for the feature distribution:

\qquad \qquad $p(x_i \; | \; y) = \mathcal{N}(x_i; \hat \mu_{y,i}, \hat \sigma^2_{y,i})$ \\[-13pt]

where: \\[-10pt]

\qquad \quad $\hat \mu_{y,i} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} x_{j,i}$

\qquad \quad $\hat \sigma^2_{y,i} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} (x_{j,i} - \hat \mu_{y, i})^2$

Predictions are made by: \\[-20pt]
$$y = \argmax{\hat y} \; p(\hat y \; | \; x) = \argmax{\hat y} \; p(\hat y) \cdot \prod_{i=1}^d p(x_i \; | \; \hat y)$$

This is equivalent to the decision rule for binary classification: \\[-8pt]

\qquad \qquad $y = \sgn \left( \color{Red} \log \frac{p(Y = +1 \; | \; x)}{p(Y = -1 \; | \; x)} \color{Black} \right)$ \\[-3pt]

where \color{Red}$f(x)$\color{Black} is called the discriminant function. If the conditional independence assumption is violated, the classifier can be overconfident.

\subsection*{Gaussian Bayes Classifier}

No independence assumption; model the features with a multivariate Gaussian $\mathcal{N}(x; \mu_y, \Sigma_y)$:

\quad $\hat \mu_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} x_{j}$

\quad $\hat \Sigma_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} (x_{j} - \hat \mu_{y}) (x_{j} - \hat \mu_{y})^\top$

This is also called \textbf{quadratic discriminant analysis} (QDA). LDA: $\Sigma_+ = \Sigma_-$, Fisher LDA: $p(y) = \frac{1}{2}$, outlier detection: $p(x) \leq \tau$.

\subsection*{Avoiding Overfitting}

MLE is prone to overfitting. Avoid this by restricting the model class (fewer parameters, e.g. GNB) or by using priors (restrict parameter values).

\subsection*{Generative vs. Discriminative}

\textbf{Discriminative models}:

$p(y \; | \; x)$, can't detect outliers, more robust

\textbf{Generative models}:

$p(x,y)$, can be more powerful (detect outliers, handle missing values) if the assumptions are met, but are typically less robust against outliers
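A minimal numpy sketch of the Gaussian Naive Bayes MLE and prediction rule above (function names are illustrative; per-feature variances are assumed nonzero):

import numpy as np

def fit_gnb(X, y):
    # MLE for class priors and per-class, per-feature Gaussian parameters.
    classes = np.unique(y)
    priors = {c: np.mean(y == c) for c in classes}
    means  = {c: X[y == c].mean(axis=0) for c in classes}
    varis  = {c: X[y == c].var(axis=0) for c in classes}
    return classes, priors, means, varis

def predict_gnb(x, classes, priors, means, varis):
    # argmax_y  log p(y) + sum_i log N(x_i; mu_{y,i}, sigma^2_{y,i})
    def log_post(c):
        return (np.log(priors[c])
                - 0.5 * np.sum(np.log(2 * np.pi * varis[c])
                               + (x - means[c]) ** 2 / varis[c]))
    return max(classes, key=log_post)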
@@ -0,0 +1,23 @@
\section*{Kernels}

Parameterize: $w = \Phi^\top \alpha$, $K = \Phi \Phi^\top$

A kernel is \textbf{valid} if $K$ is symmetric: $k(x,z) = k(z,x)$, and psd: $z^\top K z \geq 0$

\textbf{lin.}: $k(x, z) = x^\top z$, \textbf{poly.}: $k(x, z) = (x^\top z + 1)^m$
\textbf{rbf}: $k(x, z) = \exp ( -\frac{||x - z||_\alpha}{\tau} )$

$\alpha = 1 \Rightarrow$ Laplacian kernel \\
$\alpha = 2 \Rightarrow$ Gaussian kernel

\textbf{Kernel composition rules}

$k = k_1 + k_2$, \quad $k = k_1 \cdot k_2$, \quad $\forall c > 0. \; k = c \cdot k_1$,
$k = f(k_1)$ for $f$ a polynomial with positive coefficients or the exponential function,

$\forall f. \; k(x,y) = f(x) k_1(x,y) f(y)$

\textbf{Mercer's Theorem}: Valid kernels can be decomposed into a lin. comb. of inner products.

\textbf{Kern. Ridge Reg.}
$\frac{1}{n} ||y - K\alpha ||_2^2 + \lambda \alpha^\top K \alpha$
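A minimal numpy sketch of kernel ridge regression with the Gaussian rbf kernel, assuming an invertible kernel matrix; under that assumption the objective above has the closed form alpha = (K + n*lambda*I)^{-1} y:

import numpy as np

def rbf_kernel(A, B, tau=1.0):
    # Gaussian rbf kernel k(x, z) = exp(-||x - z||^2 / tau)
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / tau)

def kernel_ridge_fit(X, y, lam=0.1, tau=1.0):
    n = len(X)
    K = rbf_kernel(X, X, tau)
    return np.linalg.solve(K + n * lam * np.eye(n), y)   # alpha

def kernel_ridge_predict(X_train, alpha, X_new, tau=1.0):
    return rbf_kernel(X_new, X_train, tau) @ alpha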
@@ -0,0 +1,19 @@
\section*{Model Error}

\textbf{Empirical Risk} \quad \ $\hat R_D(f) = \frac{1}{n} \sum \ell(y, f(x))$

\textbf{Population Risk} \quad $R(f) = \E_{x,y \sim p}[\ell(y, f(x))]$

It holds that $\E_D[\hat R_D (\hat f_D)] \leq \E_D[R(\hat f_D)]$, i.e. the training error underestimates the true risk. We call $R(\hat f)$ the generalization error.

\textbf{Bias-Variance Tradeoff}:

Pred. error = \color{red} Bias$^2$ \color{black} + \color{blue} Variance \color{black} + \color{ForestGreen} Noise \color{black}
\begin{align*}
\E_D[R(\hat f_D)] &= \color{red} \E_x[(f^*(x) - \E_D[\hat f_D(x)])^2] \color{black} \\[-4pt]
&+ \color{blue} \E_x[\E_D[(\hat f_D(x) - \E_D[\hat f_D(x)])^2]] \color{black} + \color{ForestGreen} \sigma^2 \color{black}
\end{align*}

\textbf{Bias}: how close $\hat f$ can get to $f^*$

\textbf{Variance}: how much $\hat f$ changes with $D$
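A small Monte Carlo sketch of the decomposition on a synthetic 1-D problem (the target function, noise level, and polynomial degrees are all illustrative choices):

import numpy as np

rng = np.random.default_rng(0)
f_star = lambda x: np.sin(3 * x)          # "true" target
sigma = 0.3                               # noise std; contributes sigma^2 to the risk

def fit_poly(deg, n=30):
    x = rng.uniform(-1, 1, n)
    y = f_star(x) + sigma * rng.normal(size=n)
    return np.polyfit(x, y, deg)

x_test = np.linspace(-1, 1, 200)
for deg in (1, 9):
    preds = np.array([np.polyval(fit_poly(deg), x_test) for _ in range(200)])
    bias2 = np.mean((f_star(x_test) - preds.mean(axis=0)) ** 2)
    var = np.mean(preds.var(axis=0))
    print(deg, round(bias2, 3), round(var, 3))   # low degree: high bias; high degree: high variance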
@@ -0,0 +1,32 @@
\section*{Neural Networks}
$w$ are the weights and $\varphi: \R \mapsto \R$ is a nonlinear \textbf{activation function}: $\phi(x, w) = \varphi(w^\top x)$

$\textbf{ReLU: } \max (0,z), \; \textbf{Tanh: } \frac{\exp(z) - \exp(-z)}{\exp(z) + \exp(-z)}$ \\[-3pt]
$\textbf{Sigmoid: } \frac{1}{1 + \exp(-z)}$

\textbf{Universal Approximation Theorem}: Any smooth target function can be approximated arbitrarily well by a network with one (or more) hidden layer(s) of sufficient width.

\subsection*{Forward Propagation}

Input: $v^{(0)} = [x; 1]$ \quad Output: $f = W^{(L)} v^{(L-1)}$
Hidden: $z^{(l)} = W^{(l)} v^{(l-1)}, \; v^{(l)} = [\varphi(z^{(l)}); 1]$

\subsection*{Backpropagation}

Non-convex optimization problem: \\[-10pt]

\includegraphics[width=\columnwidth]{backpropagation.png} \\[-15pt]

Only compute \color{Red} \textbf{the gradient}\color{Black}. Randomly initialize the weights with a variance chosen according to $\varphi$ ($2 / n_{\text{in}}$ for ReLU and $1/n_{\text{in}}$ or $1/(n_{\text{in}} + n_{\text{out}})$ for Tanh).

\subsection*{Overfitting}
\textbf{Regularization}; \textbf{Early Stopping}; \textbf{Dropout}: ignore hidden units with prob. $p$ during training, after training use all units and scale weights by $p$; \textbf{Batch Normalization}: normalize the input data (mean 0, variance 1) in each layer

\subsection*{CNN \quad \color{Black}$\varphi(W * v^{(l)})$}

The output dimension when applying $m$ different $f \times f$ filters to an $n \times n$ image with padding $p$ and stride $s$ is: $l = \frac{n + 2p - f}{s} + 1$

For each channel there is a separate filter.
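A minimal numpy sketch of the forward-propagation recursion above (the layer sizes and the ReLU-style initialization scale are illustrative assumptions):

import numpy as np

def forward(x, weights, phi=lambda z: np.maximum(0, z)):
    # v^(0) = [x; 1];  z^(l) = W^(l) v^(l-1);  v^(l) = [phi(z^(l)); 1];  f = W^(L) v^(L-1)
    v = np.append(x, 1.0)
    for W in weights[:-1]:            # hidden layers
        z = W @ v
        v = np.append(phi(z), 1.0)
    return weights[-1] @ v            # output layer, no activation

# Hypothetical shapes: 3 inputs -> 4 hidden units -> 2 outputs (bias via the appended 1).
rng = np.random.default_rng(0)
weights = [rng.normal(scale=np.sqrt(2 / 4), size=(4, 4)),
           rng.normal(scale=np.sqrt(2 / 5), size=(2, 5))]
print(forward(np.array([0.5, -1.0, 2.0]), weights))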
@@ -0,0 +1,14 @@
\section*{Gradient Descent}
Converges to the global optimum only in the convex case.
\[
w^{t+1} = w^t - \eta_t \cdot \nabla \ell(w^t)
\]

For linear regression:
\[
||w^t - w^*||_2 \leq ||I - \eta X^\top X||_{op}^t \, ||w^0 - w^*||_2
\]

$\rho = ||I - \eta X^\top X||_{op}$ governs the conv. speed for const. $\eta$. Opt. fixed $\eta = \frac{2}{\lambda_{\text{min}} + \lambda_{\text{max}}}$ and max. $\eta \leq \frac{2}{\lambda_{\text{max}}}$.

\textbf{Momentum}: $w^{t+1} = w^t + \gamma \Delta w^{t-1} - \eta_t \nabla \ell(w^t)$
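A minimal numpy sketch of gradient descent with momentum on a least-squares problem; step size, momentum coefficient, and iteration count are illustrative and assume a reasonably conditioned $X^\top X$:

import numpy as np

def gd_momentum(grad, w0, eta=0.005, gamma=0.9, steps=500):
    # w^{t+1} = w^t + gamma * dw^{t-1} - eta * grad(w^t)
    w, dw = w0.copy(), np.zeros_like(w0)
    for _ in range(steps):
        dw = gamma * dw - eta * grad(w)
        w = w + dw
    return w

rng = np.random.default_rng(1)
X = rng.normal(size=(50, 3))
y = X @ np.array([1.0, -2.0, 0.5])
# Gradient of ||Xw - y||^2 is 2 X^T (Xw - y).
w_hat = gd_momentum(lambda w: 2 * X.T @ (X @ w - y), np.zeros(3))
print(w_hat)   # should be close to [1, -2, 0.5]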
@@ -0,0 +1,7 @@
\section*{KNN Classification}

\begin{compactitem}
\item Pick $k$ and a distance metric $d$
\item For a given $x$, find among $x_1,...,x_n \in D$ the $k$ points closest to $x \to x_{i_1},..., x_{i_k}$
\item Output the majority vote of their labels
\end{compactitem}
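A minimal numpy sketch of this procedure with the Euclidean distance as $d$ (function name is illustrative):

import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x, k=5):
    # Majority vote among the k training points closest to x.
    dists = np.linalg.norm(X_train - x, axis=1)
    nearest = np.argsort(dists)[:k]
    return Counter(y_train[nearest]).most_common(1)[0][0]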
@@ -0,0 +1,37 @@
\section*{Regression}

\textbf{Squared loss} \quad (convex)

\qquad \qquad $\frac{1}{n}\sum (y_i - f(x_i))^2 = \frac{1}{n}||y - X w||_2^2$

\qquad \qquad $\nabla_w L(w) = \frac{2}{n} X^\top(Xw - y)$

Solution: $\hat{w} = (X^\top X)^{-1}X^\top y$

\subsection*{Regularization}

\textbf{Lasso Regression} \quad (sparse)

\qquad \qquad $\argmin{w \in \R^d} ||y - \Phi w||_2^2 + \lambda ||w||_1$

\textbf{Ridge Regression}

\qquad \qquad $\argmin{w \in \R^d} ||y - \Phi w||_2^2 + \lambda ||w||_2^2$

\qquad \qquad $\nabla_w L(w) = 2X^\top(Xw - y) + 2 \lambda w$

Solution: $\hat w = (X^\top X + \lambda I)^{-1} X^\top y$

Large $\lambda \Rightarrow$ larger bias but smaller variance.

\subsection*{Cross-Validation}

\begin{compactitem}
\item For all folds $i = 1,..., k$:
\begin{compactitem}
\item Train $\hat{f}_i$ on $D' - D'_i$
\item Val. error $R_i = \frac{1}{|D'_i|} \sum \ell(\hat{f}_i(x), y)$
\end{compactitem}
\item Compute the CV error $\frac{1}{k} \sum_{i=1}^k R_i$
\item Pick the model with the lowest CV error
\end{compactitem}
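A minimal numpy sketch of the ridge closed form and the k-fold cross-validation loop above (the candidate $\lambda$ grid and function names are illustrative):

import numpy as np

def ridge_fit(X, y, lam):
    d = X.shape[1]
    return np.linalg.solve(X.T @ X + lam * np.eye(d), X.T @ y)

def cv_error(X, y, lam, k=5):
    # k-fold CV error for ridge regression with squared loss.
    folds = np.array_split(np.random.permutation(len(X)), k)
    errs = []
    for i in range(k):
        val = folds[i]
        train = np.concatenate([folds[j] for j in range(k) if j != i])
        w = ridge_fit(X[train], y[train], lam)
        errs.append(np.mean((y[val] - X[val] @ w) ** 2))
    return np.mean(errs)

# Pick the lambda with the lowest CV error:
# best_lam = min([0.01, 0.1, 1.0, 10.0], key=lambda l: cv_error(X, y, l))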