update k-means
Signed-off-by: Thomas Gassmann <tgassmann@student.ethz.ch>
thomasgassmann committed Jun 23, 2024
1 parent 047b80e commit bf93923
Showing 6 changed files with 12 additions and 34 deletions.
11 changes: 0 additions & 11 deletions iml/Clustering.tex

This file was deleted.

4 changes: 2 additions & 2 deletions iml/chapters/gaussian-mixture-model.tex
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ \subsection*{Hard-EM Algorithm}
\end{align*}
\textbf{M-Step}: compute MLE of $\theta^{(t)}$ as for GBC. \smallskip

Problems: tries to extract too much inf, works poorly if clusters overlapping, with uniform weights and spherical covariances equivalent to k-means with Lloyd's heuristic
Problems: works poorly if clusters overlap; with uniform weights and spherical covariances it is equivalent to k-means with Lloyd's heuristic

\subsection*{Soft-EM Algorithm}

Expand Down Expand Up @@ -49,7 +49,7 @@ \subsection*{Gaussian-Mixture Bayes Classifiers}

\subsection*{GMMs for Density Estimation}

Can be used for anomaly detection or data imputation. Detect outliers, by comparing the estimated density against $\tau$. Allows to control the FP rate. Use ROC curve as evaluation criterion and optimize using CV to find $\tau$.
Detect outliers by comparing the estimated density against a threshold $\tau$; this allows controlling the FP rate. Use the ROC curve as evaluation criterion and find $\tau$ via CV.
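A minimal NumPy sketch of this outlier rule (the mixture parameters `weights`, `means`, `covs` and the threshold `tau` are assumed to be already fitted and tuned; in practice a library such as scikit-learn would supply them):

```python
import numpy as np

def gmm_density(x, weights, means, covs):
    """Density of a Gaussian mixture at the points x (one per row)."""
    x = np.atleast_2d(x)
    dens = np.zeros(len(x))
    for w, mu, cov in zip(weights, means, covs):
        d = len(mu)
        diff = x - mu
        inv = np.linalg.inv(cov)
        norm = 1.0 / np.sqrt((2 * np.pi) ** d * np.linalg.det(cov))
        # add w * N(x; mu, cov); Mahalanobis term via einsum
        dens += w * norm * np.exp(-0.5 * np.einsum('ij,jk,ik->i', diff, inv, diff))
    return dens

# parameters of an (assumed already fitted) two-component mixture
weights = [0.5, 0.5]
means = [np.zeros(2), np.array([4.0, 4.0])]
covs = [np.eye(2), np.eye(2)]
tau = 1e-3  # threshold; tuned via CV / ROC in practice

points = np.array([[0.0, 0.0], [10.0, -10.0]])
is_outlier = gmm_density(points, weights, means, covs) < tau  # → [False, True]
```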

\subsection*{General EM Algorithm}

Expand Down
4 changes: 1 addition & 3 deletions iml/chapters/neural-networks.tex
Expand Up @@ -25,6 +25,4 @@ \subsection*{Overfitting}

\subsection*{CNN \quad \color{Black}$\varphi(W * v^{(l)})$}

The output dimension when applying $m$ different $f \times f$ filters to an $n \times n$ image with padding $p$ and stride $s$ is: $l = \frac{n + 2p - f}{s} + 1$

For each channel there is a separate filter.
The output dim. of applying $m$ $f \times f$ filters to an $n \times n$ image with padding $p$ and stride $s$ is $l = \frac{n + 2p - f}{s} + 1$; each channel has a separate filter.
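The formula can be sanity-checked with a small helper (the name `conv_output_dim` is illustrative):

```python
def conv_output_dim(n, f, p, s):
    """Spatial output size for an f x f filter on an n x n input
    with padding p and stride s: l = (n + 2p - f)/s + 1."""
    num = n + 2 * p - f
    assert num % s == 0, "filter placements must tile the input evenly"
    return num // s + 1

conv_output_dim(32, 5, 0, 1)  # → 28
conv_output_dim(28, 3, 1, 1)  # → 28 ("same" padding)
conv_output_dim(32, 2, 0, 2)  # → 16
```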
25 changes: 9 additions & 16 deletions iml/chapters/unsupervised-learning.tex
Expand Up @@ -2,36 +2,29 @@ \section*{Unsupervised Learning}

\subsection*{k-Means Clustering}

Optimization Goal (non-convex):
Want $\mu_{1:k}$ to minimize $\sum_{i=1}^n \min_{j\in\{1,...,k\}}\|x_i-\mu_j\|_2^2$
Non-convex and NP-hard in general. Can be kernelized.

\qquad $\hat{R} (\mu) = \sum_{i=1}^n \min_{j\in \{1,...,k\}} ||x_i - \mu_j||_2^2$
\subsection*{Lloyd's heuristic}
$\hspace*{3mm}z_i^{(t)} = \text{argmin}_{j\in\{1,...,k\}}\|x_i - \mu_j^{(t-1)}\|_2^2\\
\hspace*{3mm}\mu_j^{(t)} = \frac{1}{n_j}\sum_{i:z_i^{(t)}=j}x_i$\\
Monotonically decreases the objective and converges to a local optimum. Cost per iteration is $O(nkd)$; worst case, exponentially many iterations.

Lloyd's heuristic:
Init. cluster centers $\mu^{(0)}$:
\begin{compactitem}
\item Assign points to closest center
\item Update $\mu_i$ as mean of assigned points
\end{compactitem}

Converges in exponential time.
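A minimal NumPy sketch of Lloyd's heuristic as described above (the initial centers `mu` are assumed given, e.g. from k-Means++):

```python
import numpy as np

def lloyd(X, mu, iters=100):
    """Lloyd's heuristic: alternate assignment and mean-update steps.
    Each iteration costs O(nkd); the objective never increases."""
    for _ in range(iters):
        # assign every point to its closest center
        d2 = ((X[:, None, :] - mu[None, :, :]) ** 2).sum(-1)  # (n, k)
        z = d2.argmin(axis=1)
        # recompute each center as the mean of its assigned points
        new_mu = np.array([X[z == j].mean(axis=0) if np.any(z == j) else mu[j]
                           for j in range(len(mu))])
        if np.allclose(new_mu, mu):
            break
        mu = new_mu
    return mu, z

X = np.array([[0.0, 0.0], [0.0, 1.0], [10.0, 10.0], [10.0, 11.0]])
mu, z = lloyd(X, np.array([[0.0, 0.0], [10.0, 10.0]]))  # → centers [0, 0.5], [10, 10.5]
```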

Initialize with \textbf{k-Means++}:

\begin{compactitem}
\textbf{k-Means++}: \begin{compactitem}
\item Random data point $\mu_1 = x_i$
\item Add $\mu_2,...,\mu_k$ randomly: given $\mu_{1:j}$, pick $\mu_{j+1} = x_i$ with prob. $p(i) = \frac{1}{z} \min_{l \in \{1,...,j\}} ||x_i - \mu_l||_2^2$
\end{compactitem}
In expectation the cost is $\mathcal O(\log k)$ times that of the optimal solution.
Find $k$ by stopping when the loss decrease becomes negligible, or by regularizing.
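A sketch of k-Means++ seeding under the sampling rule above (pure NumPy; function and parameter names are illustrative):

```python
import numpy as np

def kmeans_pp_init(X, k, seed=0):
    """k-Means++ seeding: first center uniform over the data, then each
    new center is a data point drawn with probability proportional to its
    squared distance to the closest center chosen so far."""
    rng = np.random.default_rng(seed)
    centers = [X[rng.integers(len(X))]]
    for _ in range(k - 1):
        # squared distance of every point to its nearest chosen center
        d2 = ((X[:, None, :] - np.array(centers)[None]) ** 2).sum(-1).min(axis=1)
        centers.append(X[rng.choice(len(X), p=d2 / d2.sum())])
    return np.array(centers)

X = np.array([[0.0, 0.0], [0.0, 1.0], [10.0, 10.0], [10.0, 11.0]])
centers = kmeans_pp_init(X, 2)
```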

\subsection*{Principal Component Analysis}

Given centered data, the PCA problem is
$$\min_{W^\top W=I_k,z_i\in\R^k}\sum_{i=1}^n||W z_i - x_i||_2^2,$$
with solution $W^* = (v_1|...|v_k)$ where $v_i$ are the ordered
eigenvectors of $\frac{1}{n}\sum_ix_ix_i^\top$
eigvec. of $\frac{1}{n}\sum_ix_ix_i^\top$
and $z_i = {W^*}^\top x_i$.
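The closed-form solution can be sketched in NumPy via an eigendecomposition of the empirical covariance (`np.linalg.eigh` returns eigenvalues in ascending order, so the columns are reversed to get the top-$k$):

```python
import numpy as np

def pca(X, k):
    """PCA: W* = top-k eigenvectors of (1/n) sum_i x_i x_i^T (data centered),
    codes z_i = W*^T x_i."""
    X = X - X.mean(axis=0)              # center the data
    C = X.T @ X / len(X)                # empirical covariance
    _, eigvec = np.linalg.eigh(C)       # eigenvalues in ascending order
    W = eigvec[:, ::-1][:, :k]          # top-k eigenvectors, descending
    return W, X @ W

X = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [-1.0, -1.0]])
W, Z = pca(X, 1)  # rank-1 data: first PC is (1,1)/sqrt(2) up to sign
```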

\subsection*{PCA through SVD}
Expand Down
1 change: 0 additions & 1 deletion iml/chapters/various.tex
Expand Up @@ -20,7 +20,6 @@ \section*{Various}
\widehat{\operatorname{cov}}(X)=\frac{1}{n}X^\top X=\frac{1}{n}\sum_i x_ix_i^\top
$; $p(z|x,\theta) = \frac{p(x,z|\theta)}{p(x | \theta)}$, $\text{Var}(AX) = A \text{Var}(X) A^\top$
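The identity $\text{Var}(AX) = A \text{Var}(X) A^\top$ holds exactly for the empirical covariance $\frac{1}{n}X^\top X$ of centered data, which a quick NumPy check confirms (the data and $A$ here are arbitrary examples):

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((500, 3))
X -= X.mean(axis=0)                 # rows x_i, centered
cov = X.T @ X / len(X)              # (1/n) X^T X

A = np.array([[1.0, 2.0, 0.0],
              [0.0, 1.0, -1.0]])
Y = X @ A.T                         # transformed samples y_i = A x_i
cov_Y = Y.T @ Y / len(Y)

assert np.allclose(cov_Y, A @ cov @ A.T)  # Var(AX) = A Var(X) A^T
```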


$A^{-1}=
\begin{bsmallmatrix}
a&b \\
Expand Down
1 change: 0 additions & 1 deletion iml/main.tex
Expand Up @@ -133,7 +133,6 @@
\small

\input{Kernels}
\input{Clustering}
\input{ProbabilityModeling}
\input{Generative}
\input{Latent}
Expand Down
