
Commit 411b0d4

Adapt some GMM stuff
Signed-off-by: Thomas Gassmann <tgassmann@student.ethz.ch>
thomasgassmann committed Jun 23, 2024
1 parent bf93923 commit 411b0d4
Showing 8 changed files with 13 additions and 289 deletions.
44 changes: 0 additions & 44 deletions iml/Generative.tex

This file was deleted.

8 changes: 0 additions & 8 deletions iml/Kernels.tex

This file was deleted.

50 changes: 0 additions & 50 deletions iml/Latent.tex

This file was deleted.

31 changes: 0 additions & 31 deletions iml/ProbabilityModeling.tex

This file was deleted.

4 changes: 2 additions & 2 deletions iml/chapters/gaussian-mixture-model.tex
@@ -1,6 +1,6 @@
 \section*{Gaussian Mixture Model}
 
-data is generated from a convex-combination of Gaussian distributions
+Data is generated from a convex combination of Gaussian distributions
 
 $p(x | \theta) = p(x | \mu, \Sigma, w) = \sum_{j=1}^k w_j \mathcal{N}(x; \mu_j, \Sigma_j)$

@@ -20,7 +20,7 @@ \subsection*{Hard-EM Algorithm}
 \end{align*}
 \textbf{M-Step}: compute MLE of $\theta^{(t)}$ as for GBC. \smallskip
 
-Problems: works poorly if clusters overlapping, with uniform weights and spherical covariances equivalent to k-means with Lloyd's heuristic
+Problems: works poorly if clusters overlap; Hard-EM for GMM with $w_z = \frac{1}{k}$, $\Sigma_z = \sigma^2 I$ is equivalent to $k$-means.
 
 \subsection*{Soft-EM Algorithm}
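For reference, a minimal Python sketch of the Hard-EM loop this file describes: the E-step hard-assigns each point to its most likely component, the M-step recomputes the per-component MLE as for GBC. Illustrative only, not part of the repository; assumes numpy/scipy, float inputs, and that no cluster empties out.

import numpy as np
from scipy.stats import multivariate_normal

def hard_em(X, k, n_iter=50, seed=0):
    # Hard-EM for a GMM; returns weights, means, covariances, assignments.
    rng = np.random.default_rng(seed)
    n, d = X.shape
    mu = X[rng.choice(n, size=k, replace=False)].astype(float)  # means from random points
    Sigma = np.stack([np.eye(d)] * k)                           # identity covariances
    w = np.full(k, 1.0 / k)                                     # uniform mixture weights
    for _ in range(n_iter):
        # E-step: z_i = argmax_z  log w_z + log N(x_i; mu_z, Sigma_z)
        logp = np.stack([np.log(w[j]) + multivariate_normal.logpdf(X, mu[j], Sigma[j])
                         for j in range(k)], axis=1)
        z = logp.argmax(axis=1)
        # M-step: per-component MLE, as for the Gaussian Bayes classifier
        for j in range(k):
            Xj = X[z == j]                       # assumes cluster j is non-empty
            w[j] = len(Xj) / n
            mu[j] = Xj.mean(axis=0)
            D = Xj - mu[j]
            Sigma[j] = D.T @ D / len(Xj) + 1e-6 * np.eye(d)  # jitter keeps Sigma_j p.d.
    return w, mu, Sigma, z

With $w_z = \frac{1}{k}$ fixed and $\Sigma_z = \sigma^2 I$, the E-step reduces to nearest-centroid assignment, which is exactly Lloyd's k-means step.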
22 changes: 9 additions & 13 deletions iml/chapters/generative-modeling.tex
@@ -2,13 +2,19 @@ \section*{Generative Modeling}

 Aim to estimate $p(x, y)$ for complex situations using Bayes' rule: $p(x,y) = p(x|y) \cdot p(y)$
 
-\subsection*{Naive Bayes Model}
-
-GM for classification tasks. Assuming for a class label, each feature is independent. This helps estimating $p( x \; | \; y) =\prod_{i=1}^d p(x_i \; | \; y_i)$.
+\subsection*{Gaussian Bayes Classifier}
+
+No independence assumption; model the features with a multivariate Gaussian $\mathcal{N}(x; \mu_y, \Sigma_y)$:
+
+\quad $\hat \mu_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} x_{j}$
+
+\quad $\hat \Sigma_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} (x_{j} - \hat \mu_{y}) (x_{j} - \hat \mu_{y})^\top$
+
+This is also called \textbf{quadratic discriminant analysis} (QDA). LDA: $\Sigma_+ = \Sigma_-$; Fisher LDA: $p(y) = \frac{1}{2}$; classify $x$ as an outlier if $p(x) \leq \tau$.
 
 \subsection*{Gaussian Naive Bayes Classifier}
 
-Naive Bayes Model with ind. Gaussians features. Estimate the parameters via MLE:
+GBC with diagonal $\Sigma$s. Estimate the parameters via MLE:
 
 MLE for class prior: $p(y) = \hat p_y = \frac{\text{Count}(Y = y)}{n}$
 MLE for feature distribution:
@@ -31,16 +37,6 @@ \subsection*{Gaussian Naive Bayes Classifier}

 Where \color{Red}$f(x)$\color{Black} is called the discriminant function. If the conditional independence assumption is violated, the classifier can be overconfident.
 
-\subsection*{Gaussian Bayes Classifier}
-
-No independence assumption, model the features with a multivariate Gaussian $\mathcal{N}(x; \mu_y, \Sigma_y)$:
-
-\quad $\mu_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} x_{j}$
-
-\quad $\Sigma_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} (x_{j} - \hat \mu_{y}) (x_{j} - \hat \mu_{y})^\top$
-
-This is also called the \textbf{quadratic discriminant analysis} (QDA). LDA: $\Sigma_+ = \Sigma_-$, Fisher LDA: $p(y) = \frac{1}{2}$, Outlier detection: $p(x) \leq \tau$.
 
 \subsection*{Avoiding Overfitting}
 
 MLE is prone to overfitting. Avoid this by restricting model class (fewer parameters, e.g. GNB) or using priors (restrict param. values).
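Since the moved text names QDA and LDA without the algebra, here is the standard derivation of the discriminant for binary labels $y \in \{+,-\}$, a sketch using the estimates $\hat p_\pm$, $\hat \mu_\pm$, $\hat \Sigma_\pm$ above (not part of this commit):

\begin{align*}
f(x) &= \log \frac{p(y = + \mid x)}{p(y = - \mid x)}
      = \log \frac{\hat p_+}{\hat p_-} + \frac{1}{2} \log \frac{|\hat \Sigma_-|}{|\hat \Sigma_+|} \\
     &\quad - \frac{1}{2} (x - \hat \mu_+)^\top \hat \Sigma_+^{-1} (x - \hat \mu_+)
      + \frac{1}{2} (x - \hat \mu_-)^\top \hat \Sigma_-^{-1} (x - \hat \mu_-)
\end{align*}

$f$ is quadratic in $x$, hence "quadratic" discriminant analysis; with $\hat \Sigma_+ = \hat \Sigma_-$ the quadratic terms cancel and the boundary $f(x) = 0$ is linear (LDA). Predict $\hat y = \operatorname{sgn}(f(x))$.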
3 changes: 2 additions & 1 deletion iml/chapters/kernels.tex
@@ -25,4 +25,5 @@ \section*{Kernels}
 \end{rowlist}
 
 \textbf{Kern. Ridge Reg.}
-$\frac{1}{n} ||y - K\alpha ||_2^2 + \lambda \alpha^\top K \alpha$
+Ansatz: $w^* = \Phi^\top \alpha$\\
+$\min_w \frac{1}{n} \|y - \Phi w\|_2^2 + \lambda \|w\|_2^2 = \min_\alpha \frac{1}{n} \|y - K\alpha\|_2^2 + \lambda \alpha^\top K \alpha$
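The added lines state the kernelized objective but not its minimizer; setting the gradient to zero yields the standard closed form (assuming $K \succ 0$; a derivation the notes omit):

$\frac{2}{n} K (K\alpha - y) + 2 \lambda K \alpha = 0 \;\Rightarrow\; \hat \alpha = (K + n \lambda I)^{-1} y$

Predictions then use $\hat f(x) = \sum_{i=1}^n \hat \alpha_i k(x_i, x)$.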
140 changes: 0 additions & 140 deletions iml/main.tex

This file was deleted.
