move kernel things over
Signed-off-by: Thomas Gassmann <tgassmann@student.ethz.ch>
thomasgassmann committed Jun 23, 2024
1 parent f815314 commit 047b80e
Showing 6 changed files with 21 additions and 33 deletions.
17 changes: 0 additions & 17 deletions iml/Kernels.tex
Original file line number Diff line number Diff line change
@@ -1,21 +1,4 @@
\section*{Kernels}
\subsection*{Examples of Kernels}
$k(x,y) = (x^\top y)^m$ all monomials of deg. m \\
$k(x,y) = (1+x^\top y)^m$ all monomials up to deg. m\\
There are $\binom{d+m}{m}=O_d(d^m)=O_m(m^d)$ monomials of
order $m$ in $d$ variables.\\


\subsection*{Properties $k(x,y) = \phi(x)^\top \phi(y)$}


\subsection*{Valid kernels}
$f(k)$, where $f$ is a polynomial/power series with non-negative coefficients;\\
$k(\binom{x}{y}, \binom{x'}{y'})=k(x,x')k(y,y'),\\
k(\binom{x}{y}, \binom{x'}{y'})=k(x,x') + k(y,y')$ \\
where $\binom{x}{y}$ is concatenation of vectors;\\
$k(x,y)=g(x)k(x,y)g(y)$ where $g\colon X\to\R$.
If for $g$ all Taylor coefficients non-negative, then $k(x, x') = g(\left< x, x' \right>)$ is a valid kernel

\subsection*{Kernelized Ridge}
Ansatz: $w^*=\Phi^\top\alpha$\\
6 changes: 1 addition & 5 deletions iml/chapters/gaussian-mixture-model.tex
@@ -35,11 +35,7 @@ \subsection*{Soft-EM Algorithm}
\qquad \quad $\Sigma_j^{(t)} = \frac{\sum_{i=1}^n \gamma_j^{(t)}(x_i)(x_i - \mu_j^{(t)})(x_i - \mu_j^{(t)})^\top}{\sum_{i=1}^n \gamma_j^{(t)}(x_i)}$


Init. the weights as uniformly, or with k-Means++ and for variances use spherical init. or empirical covariance of the data. Select $k$ using cross-validation.

\subsection*{Degeneracy of GMMs}

GMMs can overfit with limited data. Avoid this by add $v^2 I$ to variance (choose with cv), so it does not collapse.
Init. the weights uniformly or with k-Means++; for the variances use spherical init. or the empirical covariance of the data. Select $k$ using cross-validation. GMMs can \color{Red}overfit \color{Black} with limited data. Avoid this by adding $v^2 I$ to the variance (chosen by CV), so it does not collapse.
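The covariance update with this degeneracy fix can be sketched in code (a minimal sketch assuming NumPy; `regularized_cov` and its arguments are illustrative names):

```python
import numpy as np

def regularized_cov(X, gamma, mu, v=0.1):
    """Weighted empirical covariance of one component plus v^2 * I.
    X: (n, d) data, gamma: (n,) responsibilities, mu: (d,) component mean."""
    diff = X - mu
    cov = (gamma[:, None] * diff).T @ diff / gamma.sum()
    return cov + v ** 2 * np.eye(X.shape[1])  # keeps Sigma from collapsing

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2))
gamma = np.ones(100)  # simple case: one component owns every point
S = regularized_cov(X, gamma, X.mean(axis=0), v=0.5)
assert np.linalg.eigvalsh(S).min() >= 0.25  # eigenvalues lifted by v^2
```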

\subsection*{Gaussian-Mixture Bayes Classifiers}

2 changes: 1 addition & 1 deletion iml/chapters/generative-modeling.tex
@@ -33,7 +33,7 @@ \subsection*{Gaussian Naive Bayes Classifier}

\subsection*{Gaussian Bayes Classifier}

No independence assumption, model the features with a multivariant Gaussian $\mathcal{N}(x; \mu_y, \Sigma_y)$:
No independence assumption, model the features with a multivariate Gaussian $\mathcal{N}(x; \mu_y, \Sigma_y)$:

\quad $\mu_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} x_{j}$
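The class-conditional mean estimate above amounts to a per-label average (a minimal sketch assuming NumPy; `class_means` is an illustrative name):

```python
import numpy as np

def class_means(X, y):
    """mu_y = average of the feature vectors x_j with label y_j = y."""
    return {c: X[y == c].mean(axis=0) for c in np.unique(y)}

X = np.array([[0.0, 0.0], [2.0, 2.0], [4.0, 4.0]])
y = np.array([0, 0, 1])
mu = class_means(X, y)
assert np.allclose(mu[0], [1.0, 1.0]) and np.allclose(mu[1], [4.0, 4.0])
```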

18 changes: 14 additions & 4 deletions iml/chapters/kernels.tex
@@ -4,15 +4,25 @@ \section*{Kernels}

A kernel is \textbf{valid} if it is symmetric, $k(x,z) = k(z,x)$, and psd: every Gram matrix $K$ (matrix of all pairwise inner products) satisfies $z^\top K z \geq 0$ for all $z$.
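Validity can be spot-checked numerically: form the Gram matrix on a few sample points and test symmetry and non-negative eigenvalues (a sketch assuming NumPy; `gram_matrix`/`is_psd` are illustrative names, and a check on finitely many points is necessary but not sufficient for validity):

```python
import numpy as np

def gram_matrix(kernel, X):
    """K[i, j] = k(x_i, x_j) for all pairs of rows of X."""
    return np.array([[kernel(xi, xj) for xj in X] for xi in X])

def is_psd(K, tol=1e-9):
    """Symmetric with no eigenvalue below -tol."""
    return bool(np.allclose(K, K.T) and np.linalg.eigvalsh(K).min() >= -tol)

rbf = lambda x, z: np.exp(-np.sum((x - z) ** 2))  # Gaussian kernel, tau = 1
X = np.random.default_rng(0).normal(size=(10, 3))
assert is_psd(gram_matrix(rbf, X))  # valid kernel -> psd Gram matrix
```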

\textbf{lin.}: $k(x, z) = (x^\top z)^m$, \textbf{poly.}: $k(x, z) = (x^\top z + 1)^m$
\textbf{all monomials of deg. m}: $k(x,y) = (x^\top y)^m$ \\
\textbf{all monomials up to deg. m}: $k(x,y) = (1+x^\top y)^m$; there are $\binom{d+m}{m}$ monomials of
degree at most $m$ in $d$ variables.\\
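The monomial count can be verified by enumeration (a sketch using only the Python standard library; padding with a dummy constant variable turns "degree at most $m$" into multisets of exactly $m$ slots):

```python
import math
from itertools import combinations_with_replacement

def count_monomials(d, m):
    """Count monomials of degree <= m in d variables, two ways."""
    # A monomial of degree <= m is a multiset of at most m variables; a dummy
    # "constant" variable pads every multiset to exactly m slots.
    enumerated = sum(1 for _ in combinations_with_replacement(range(d + 1), m))
    return enumerated, math.comb(d + m, m)

both = count_monomials(3, 2)
assert both[0] == both[1] == 10  # 1, x1..x3, x1^2..x3^2, x1x2, x1x3, x2x3
```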

\textbf{RBF}: $k(x, z) = \exp ( -\frac{||x - z||^\alpha}{\tau} )$, $\alpha = 1 \Rightarrow $ Laplacian kernel, $\alpha = 2 \Rightarrow $ Gaussian kernel

\textbf{Mercer's Theorem}: every valid kernel can be decomposed into a lin. comb. of inner products.

\textbf{Kernel composition} $k = k_1 + k_2$, \quad $k = k_1 \cdot k_2$ \quad $\forall c > 0. \; k = c \cdot k_1$,
$k = f(k_1)$ holds for $f$ convex or polynomials/power series with non-neg. coefficients
\textbf{Kernel composition}

\begin{rowlist}
\item $\forall c > 0. \; k = c \cdot k_1$
\item $f(k_1)$, where $f$ is a polynomial/power series with non-negative coefficients
\item{
$k(\binom{x}{y}, \binom{x'}{y'})=k_1(x,x')k_2(y,y')$ and $k(\binom{x}{y}, \binom{x'}{y'})=k_1(x,x') + k_2(y,y')$, where $\binom{x}{y}$ is the concatenation of vectors
}
\item If all Taylor coefficients of $g$ are non-negative, then $k(x, x') = g(\left< x, x' \right>)$ is a valid kernel
\item $\forall f. \; k(x,y) = f(x)k_1(x,y)f(y)$
\end{rowlist}
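The closure rules above can be spot-checked on random data (a sketch assuming NumPy; the smallest eigenvalue of each composed Gram matrix should stay non-negative up to rounding):

```python
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(8, 3))

def gram(k):
    return np.array([[k(xi, xj) for xj in X] for xi in X])

def min_eig(K):
    return float(np.linalg.eigvalsh(K).min())

k1 = lambda x, z: x @ z                          # linear kernel (valid)
k2 = lambda x, z: np.exp(-np.sum((x - z) ** 2))  # Gaussian kernel (valid)
f = lambda x: 1.0 + abs(x[0])                    # arbitrary real function

composed = (lambda x, z: k1(x, z) + k2(x, z),    # sum rule
            lambda x, z: k1(x, z) * k2(x, z),    # product rule
            lambda x, z: f(x) * k1(x, z) * f(z)) # f(x) k1(x,z) f(z) rule
for k in composed:
    assert min_eig(gram(k)) >= -1e-9  # still psd up to rounding
```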

\textbf{Kern. Ridge Reg.}
$\frac{1}{n} ||y - K\alpha ||_2^2 + \lambda \alpha^\top K \alpha$
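Setting the gradient of this objective to zero gives the standard closed form $\hat{\alpha} = (K + n\lambda I)^{-1} y$. A minimal sketch assuming NumPy (names are illustrative):

```python
import numpy as np

def kernel_ridge(K, y, lam):
    """alpha minimizing (1/n)||y - K alpha||_2^2 + lam * alpha^T K alpha."""
    n = len(y)
    return np.linalg.solve(K + n * lam * np.eye(n), y)

rng = np.random.default_rng(0)
X = rng.normal(size=(30, 1))
y = np.sin(X[:, 0])
rbf = lambda x, z: np.exp(-np.sum((x - z) ** 2))
K = np.array([[rbf(a, b) for b in X] for a in X])
alpha = kernel_ridge(K, y, lam=1e-3)
preds = K @ alpha  # f(x_i) = sum_j alpha_j k(x_j, x_i) on the training points
assert np.mean((preds - y) ** 2) < 0.1  # small in-sample error at small lambda
```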
2 changes: 1 addition & 1 deletion iml/chapters/unsupervised-learning.tex
@@ -6,7 +6,7 @@ \subsection*{k-Means Clustering}

\qquad $\hat{R} (\mu) = \sum_{i=1}^n \min_{j\in \{1,...,k\}} ||x_i - \mu_j||_2^2$

Lloyd's heuristics:
Lloyd's heuristic:
Init. cluster centers $\mu^{(0)}$:
\begin{compactitem}
\item Assign points to closest center
9 changes: 4 additions & 5 deletions iml/chapters/various.tex
@@ -1,6 +1,5 @@
\section*{Various}

\textbf{Derivatives}:
$$\nabla_x x^\top A = A \quad \nabla_x a^\top x = \nabla_x x^\top a = a$$
$$\nabla_x b^\top A x = A^\top b \quad \nabla_x x^\top x = 2x \quad \nabla_x x^\top A x = (A + A^\top)x$$\\[-20pt]
$$\nabla_w || y-Xw||_2^2 = 2X^\top(Xw-y)$$
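The least-squares gradient identity can be verified against central finite differences (a sketch assuming NumPy; the central difference is exact for a quadratic up to rounding):

```python
import numpy as np

def loss(w, X, y):
    return np.sum((y - X @ w) ** 2)  # ||y - Xw||_2^2

def grad(w, X, y):
    return 2 * X.T @ (X @ w - y)     # the stated identity

rng = np.random.default_rng(0)
X, y, w = rng.normal(size=(5, 3)), rng.normal(size=5), rng.normal(size=3)
eps = 1e-6
numeric = np.array([(loss(w + eps * e, X, y) - loss(w - eps * e, X, y)) / (2 * eps)
                    for e in np.eye(3)])
assert np.allclose(numeric, grad(w, X, y), atol=1e-4)
```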
@@ -12,9 +11,9 @@ \section*{Various}

$\mathcal{N}(x; \mu, \Sigma) = \frac{1}{\sqrt{(2 \pi)^d \text{det}(\Sigma)}} \exp(-\frac{(x - \mu)^\top \Sigma^{-1} (x-\mu)}{2})$

\textbf{Other Facts}
\textbf{More}: $\text{Tr}(AB) = \text{Tr}(BA)$,

$\text{Tr}(AB) = \text{Tr}(BA)$, $\text{Var}(X) = \E[X^2] - \E[X]^2$, $X \in \mathbb{R}^{n \times d}: \; \; X^{-1} \rightarrow \mathcal{O}(d^3) \; X^\top X \rightarrow \mathcal{O}(nd^2)$, $\binom{n}{k} = \frac{n!}{(n-k)!k!}$, $||w^\top w||_2 = \sqrt{w^\top w}$
$\text{Var}(X) = \E[X^2] - \E[X]^2$
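Both identities are easy to confirm numerically (a sketch assuming NumPy; `np.var` uses the population convention, matching $\E[X^2]-\E[X]^2$):

```python
import numpy as np

rng = np.random.default_rng(0)
A, B = rng.normal(size=(3, 4)), rng.normal(size=(4, 3))
assert np.isclose(np.trace(A @ B), np.trace(B @ A))  # Tr(AB) = Tr(BA)

x = rng.normal(size=10_000)
# np.var is the population variance, i.e. mean(x^2) - mean(x)^2 exactly
assert np.isclose(x.var(), np.mean(x ** 2) - np.mean(x) ** 2)
```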

Cov$[X] = \E[(X- \E[X])(X- \E[X])^\top]$;
Covariance matrix for centered data: $
@@ -41,11 +40,11 @@ \section*{Various}

2: Hessian $\nabla^2 L (w) \succcurlyeq 0$ (psd)

\begin{compactitem}
\begin{rowlist}
\item $\alpha f + \beta g$, $\alpha, \beta \geq 0$, convex if $f, g$ convex
\item $f \circ g$, convex if $f$ convex and $g$ affine or $f$ convex non-decreasing and $g$ convex
\item $\max(f, g)$, convex if $f,g$ convex
\end{compactitem}
\end{rowlist}

\textbf{PSD}
$M \in \mathbb{R}^{n\times n}$ PSD $\Leftrightarrow \forall x \in \mathbb{R}^n: x^\top Mx \geq 0 \\
