diff --git a/iml/Generative.tex b/iml/Generative.tex
deleted file mode 100644
index 2b85000..0000000
--- a/iml/Generative.tex
+++ /dev/null
@@ -1,44 +0,0 @@
-\section*{Generative Modeling}
-Discriminative: Estimate $P(y\mid x)$\\
-Generative: Estimate $P(y,x)$
-
-Typical approach to generative modeling:
-\begin{enumerate}[noitemsep,leftmargin=6mm,topsep=2pt,parsep=2pt,partopsep=2pt]
- \item Estimate prior on labels $P(y)$
- \item Estimate conditional distribution $P(x\mid y)$ for each class y
- \item Obtain predictive distribution using Bayes' rule:
-$P(y\mid x) = \frac{P(y) P(x\mid y)}{P(x)} = \frac{P(x,y)}{P(x)}$
-\end{enumerate}
-
-\subsection*{Decision rule}
-$\hat{y} = \operatorname{argmax_{y}} P(y\mid x)\\
-\hspace*{2.1mm}= \operatorname{argmax_{y}} P(y) \prod_{i} P(x_i\mid y)\\
-\hspace*{2.1mm}= \operatorname{argmax_{y}} \log P(y) + \sum_{i} \log P(x_i\mid y)$
-
-\subsection*{QDA/Gaussian Bayes Classifier}
-$P(Y=y) = p_y$ and $P(x\mid y) = \mathcal{N}({\mu}_y, {\Sigma}_y)$\\
-$\hat{p}_y= \frac{\operatorname{Count(Y = y)}}{n}$\\
-$\hat{\mu}_{y} = \frac{1}{\operatorname{Count}(Y=y)} \sum_{i:y_i=y} {x_i} $\\
-$\hat{\Sigma}_{y} = \frac{1}{\operatorname{Count}(Y=y)} \sum_{i:y_i=y} (x_i - \hat{\mu}_{y})(x_i-\hat{\mu}_y)^\top $
-
-For two classes $\hat{y} = \operatorname{sign}\Big(\log\frac{P(Y=1\mid x)}{P(Y=-1\mid x)}\Big) $
-\\ where
-$ \log\frac{P(Y=1\mid x)}{P(Y=-1\mid x)} = \log \frac{\hat{p}}{1-\hat{p}} + \frac{1}{2}\log \frac{|\hat{\Sigma}_-|}{|\hat{\Sigma}_+|}\\
- + \frac{1}{2}(x - \hat{\mu}_-)^\top \hat{\Sigma}_-^{-1} (x - \hat{\mu}_-) - \frac{1}{2}(x - \hat{\mu}_+)^\top \hat{\Sigma}_+^{-1} (x - \hat{\mu}_+)$
-
-\subsection*{Gaussian Naive Bayes}
-GBC with diagonal $\Sigma$s. GNB with shared $\Sigma$s across
-two classes yields the same predictions as Logistic Regression
-(if model is true).
-
-\subsection*{Fisher's LDA (Subcase of GBC)}
-Assume: Two classes, $p = 0.5$, ${\Sigma}_- = {\Sigma}_+ $
-
-\subsection*{Outlier Detection}
-Classify $x$ as outlier if $P(x) \leq \tau$.
-
-\subsection*{Regularization}
-\begin{itemize}[noitemsep,leftmargin=6mm,topsep=0pt,parsep=0pt,partopsep=0pt]
- \item Restricting model (i.e. covariance)
- \item Prior on parameters.\\
-\end{itemize}
diff --git a/iml/Kernels.tex b/iml/Kernels.tex
deleted file mode 100644
index 995d17c..0000000
--- a/iml/Kernels.tex
+++ /dev/null
@@ -1,8 +0,0 @@
-\section*{Kernels}
-
-\subsection*{Kernelized Ridge}
-Ansatz: $w^*=\Phi^\top\alpha$\\
-$\min_w\|\Phi w-y\|^2 + \lambda ||w||_2^2\\
-=\min_a ||K\alpha -y||_2^2 + \lambda \alpha^\top K \alpha$\\
-$\alpha^*=(K+\lambda I)^{-1} y$\\
-Prediction: $\hat{y} = \Sigma_{i=1}^n \alpha_i^* k(x_i,x)$\\
\ No newline at end of file
diff --git a/iml/Latent.tex b/iml/Latent.tex
deleted file mode 100644
index 3482428..0000000
--- a/iml/Latent.tex
+++ /dev/null
@@ -1,50 +0,0 @@
-\section*{Mixture Models}
-\subsection*{Gaussian Mixtures}
-$P(x\mid z) = \sum_iw_i\mathcal{N}({\mu}_i, {\Sigma}_i)$\\
-MLE is a nonconvex problem $\rightarrow$ EM.
-
-\subsection*{Hard-EM}
-\textbf{E-step: } Compute \\$z_i^{(t)} = \operatorname{argmax_z} P(z\mid x_i, \theta^{(t-1)})\\
-\hspace*{1.4mm}= \operatorname{argmax_z} P(z\mid \theta^{(t-1)}) P(x_i\mid z,\theta^{(t-1)})\\
- \stackrel{\text{GMM}}{=}\operatorname{argmax_z} w_z^{(t-1)}\mathcal{N}(x;\ \mu_z^{(t-1)},\Sigma_z^{(t-1)})$\\
-\textbf{M-step: }
-$\theta^{(t)} = \operatorname{argmax_\theta} P(x_{1:n},z^{(t)}_{1:n}\mid \theta)$\\
-Hard-EM converges to a local maximum
-of $P(x_{1:n},z^{(t)}_{1:n}\mid \theta)$. It tends to do poorly if clusters overlap.
-Hard-EM for GMM with $w_z=\frac{1}{k}, \Sigma_z=\sigma^2{I}$
-is equivalent to k-means.
-
-\subsection*{Soft-EM}
-\textbf{E-step: \newline} Compute the distribution of $Z\mid x,\theta^{(t-1)}$, i.e. for each $x$ the responsibilities
-\begin{equation*}
-  \begin{aligned}
-    P_{\theta^{(t-1)}}(Z=j\mid x) &= \frac{P(Z=j)P(x\mid Z=j)}{P(x)} \\
-    &\stackrel{\text{GMM}}{=}
-    \frac{w_j \mathcal{N}(x;\ \Sigma_j,\mu_j)}{\sum_k w_k \mathcal{N}(x;\ \Sigma_k,\mu_k)}.
-  \end{aligned}
-\end{equation*}
-
-\textbf{M-step:}
-\begin{equation*}
-  \begin{aligned}
-    \theta^{(t)}&=\argmax_\theta
-    \mathbb{E}_{Z_{1:n}\mid x_{1:n},\theta^{(t-1)}}
-    \big[\log P_\theta(x_{1:n},Z_{1:n})\big]\\
-    &\stackrel{\text{iid\&cond.ind.}}{=}
-    \sum_{i=1}^n \mathbb{E}_{Z_i\mid x_i,\theta^{(t-1)}}
-    \big[\log P_\theta(x_i,Z_i)\big] \\
-    &= \sum_{i=1}^n \sum_{j=1}^k P_{\theta^{(t-1)}}(Z_i=j\mid x_i)\log P_\theta(x_i,Z_i=j)
-  \end{aligned}
-\end{equation*}
-
-GMM M-step:\\ $w_j^{(t)} \leftarrow \frac{1}{n} \sum_{i} \gamma_j^{(t)} (x_i)$; \\
-$\mu_j^{(t)} \leftarrow \frac{\sum_{i} \gamma_j^{(t)} (x_i) x_i}{\sum_{i} \gamma_j^{(t)} (x_i)}\\
-\Sigma_j^{(t)} \leftarrow \frac{\sum_{i} \gamma_j^{(t)}(x_i) (x_i - \mu_j^{(t)}) (x_i - \mu_j^{(t)})^\top}{\sum_{i} \gamma_j^{(t)}(x_i)} \{+\nu^2\mathbb{I}\}$\\ %\text{|}\gamma^{(t) = \gamma}$
-
-The cluster size can be selected via CV.
-EM converges to a local maximum for GMMs, dependent on
-initialization.
-\subsection*{Semi-Supervised Learning w/ GMMs:}
-Set $P_{\theta^{(t-1)}}(Z=j\mid x) =1_{\{j = y\}}$ for labeled points
-$(x,y).$
-\\
diff --git a/iml/ProbabilityModeling.tex b/iml/ProbabilityModeling.tex
deleted file mode 100644
index af25cda..0000000
--- a/iml/ProbabilityModeling.tex
+++ /dev/null
@@ -1,31 +0,0 @@
-\section*{Probabilistic Modeling}
-\subsection*{MLE}
-Given a choice of marginal $P(Y|X,\theta)$ take
-$\theta^* =\argmax_\theta \prod_{i=1}^n {P_\theta}(y_i|x_i).$
-
-\subsection*{Bayes optimality}
-$\argmin_f\E_{x,y}[(y-f(x))^2]=\E[Y\mid X]$\\
-$\argmin_f\E_{x,y}[1_{[y\neq f(x)]}]\\=\argmax_yp(Y=y\mid X=x)$
-
-\subsection*{Bias-Variance-noise decomposition}
-$\E_{x,y}[(\hat{f}_D(x)- y)^2]= \E_x[\E_D[\hat{f}_D(x)]-f^*(x)]^2\\
-\hspace*{0.1mm}+\E_{x,D}[(\hat{f}_D(x) - \E_D[\hat{f}_D(x)])^2] +\E_{x,y}[(y-f^*(x))^2]$ where $f^*=\E[Y\mid X]$. Expected generalization error equals model bias squared, model variance and observation noise.
-
-
-\subsection*{Logistic regression}
-Parametrize $P(y\mid x)$ by $\frac{1}{1+\exp(-y w^\top x)}$.\\
-MLE is $\operatorname{argmax_w} P(y_{1:n}|w,x_{1:n})\\
-= \operatorname{argmin_w} - \sum_{i=1}^n \log P(y_i|w,x_i)\\
-= \operatorname{argmin_w} \sum_{i=1}^n \log(1+\exp(-y_i w^\top x_i))$
-
-\subsection*{Gradient for logistic regression}
-$\ell(w) = \log(1+\exp(-yw^\top x))$\\
-$\nabla_w \ell(w) =\frac{-yx}{1+\exp(yw^\top x)}$
-
-\subsection*{Multiclass Logistic Regression}
-Parametrize $P(Y=i\mid x)$ by $\frac{\exp(w_i^\top x)}{\sum_j \exp(w_j^\top x)}$.
-
-\subsection*{Kernelized logistic regression}
-$\min_\alpha\sum_i\log(1+\exp(-y_i\alpha^\top K_i)) + \lambda\alpha^\top K \alpha$
-$\hat{P}(y\mid x)=\frac{1}{1+\exp(-y\sum_i\alpha_ik(x_i,x))}$
-\\
\ No newline at end of file
diff --git a/iml/chapters/gaussian-mixture-model.tex b/iml/chapters/gaussian-mixture-model.tex
index 37d133c..0ac1690 100644
--- a/iml/chapters/gaussian-mixture-model.tex
+++ b/iml/chapters/gaussian-mixture-model.tex
@@ -1,6 +1,6 @@
 \section*{Gaussian Mixture Model}
 
-data is generated from a convex-combination of Gaussian distributions
+Data is generated from a convex combination of Gaussian distributions
 
 $p(x | \theta) = p(x | \mu, \Sigma, w) = \sum_{j=1}^k w_j \mathcal{N}(x; \mu_j, \Sigma_j)$
 
@@ -20,7 +20,7 @@ \subsection*{Hard-EM Algorithm}
 \end{align*}
 \textbf{M-Step}: compute MLE of $\theta^{(t)}$ as for GBC. \smallskip
 
-Problems: works poorly if clusters overlapping, with uniform weights and spherical covariances equivalent to k-means with Lloyd's heuristic
+Problems: works poorly if clusters overlap; Hard-EM for GMM with $w_z=\frac{1}{k}, \Sigma_z=\sigma^2{I}$ is equivalent to k-means.
 
 \subsection*{Soft-EM Algorithm}
 
diff --git a/iml/chapters/generative-modeling.tex b/iml/chapters/generative-modeling.tex
index 2023055..ead6c94 100644
--- a/iml/chapters/generative-modeling.tex
+++ b/iml/chapters/generative-modeling.tex
@@ -2,13 +2,19 @@ \section*{Generative Modeling}
 Aim to estimate $p(x, y)$ for complex situations using Bayes' rule: $p(x,y) = p(x|y) \cdot p(y)$
 
-\subsection*{Naive Bayes Model}
+\subsection*{Gaussian Bayes Classifier}
+
+No independence assumption; model the features with a multivariate Gaussian $\mathcal{N}(x; \mu_y, \Sigma_y)$:
+
+\quad $\hat \mu_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} x_{j}$
+
+\quad $\hat \Sigma_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} (x_{j} - \hat \mu_{y}) (x_{j} - \hat \mu_{y})^\top$
 
-GM for classification tasks. Assuming for a class label, each feature is independent. This helps estimating $p( x \; | \; y) =\prod_{i=1}^d p(x_i \; | \; y_i)$.
+This is also called \textbf{quadratic discriminant analysis} (QDA). LDA: $\Sigma_+ = \Sigma_-$; Fisher's LDA: additionally $p(y) = \frac{1}{2}$; classify $x$ as an outlier if $p(x) \leq \tau$.
 
 \subsection*{Gaussian Naive Bayes Classifier}
 
-Naive Bayes Model with ind. Gaussians features. Estimate the parameters via MLE:
+GBC with diagonal $\Sigma$s. Estimate the parameters via MLE:
 
 MLE for class prior: $p(y) = \hat p_y = \frac{\text{Count}(Y = y)}{n}$
 
 MLE for feature distribution:
@@ -31,16 +37,6 @@ \subsection*{Gaussian Naive Bayes Classifier}
 Where \color{Red}$f(x)$\color{Black} is called the discriminant function. If the conditional independence assumption is violated, the classifier can be overconfident.
 
-\subsection*{Gaussian Bayes Classifier}
-
-No independence assumption, model the features with a multivariate Gaussian $\mathcal{N}(x; \mu_y, \Sigma_y)$:
-
-\quad $\mu_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} x_{j}$
-
-\quad $\Sigma_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} (x_{j} - \hat \mu_{y}) (x_{j} - \hat \mu_{y})^\top$
-
-This is also called the \textbf{quadratic discriminant analysis} (QDA). LDA: $\Sigma_+ = \Sigma_-$, Fisher LDA: $p(y) = \frac{1}{2}$, Outlier detection: $p(x) \leq \tau$.
-
 \subsection*{Avoiding Overfitting}
 
 MLE is prone to overfitting. Avoid this by restricting model class (fewer parameters, e.g. GNB) or using priors (restrict param. values).
 
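Reviewer note, not part of the diff: the Gaussian Bayes Classifier block moved above corresponds to a few lines of NumPy. The sketch below only illustrates the class-conditional estimators and the two-class log-odds rule quoted from the removed iml/Generative.tex; the function names and the synthetic data are made up for the example.

# Minimal QDA / Gaussian Bayes Classifier sketch (illustration only)
import numpy as np

def fit_gaussian(X):
    # MLE of mean and covariance for one class (normalised by the class count)
    mu = X.mean(axis=0)
    diff = X - mu
    Sigma = diff.T @ diff / X.shape[0]
    return mu, Sigma

def qda_log_odds(x, p_pos, mu_pos, S_pos, mu_neg, S_neg):
    # log P(Y=+1|x)/P(Y=-1|x) = log p/(1-p) + 0.5 log |S_-|/|S_+|
    #   + 0.5 (x-mu_-)^T S_-^{-1} (x-mu_-) - 0.5 (x-mu_+)^T S_+^{-1} (x-mu_+)
    def maha(v, mu, S):
        d = v - mu
        return d @ np.linalg.solve(S, d)
    return (np.log(p_pos / (1 - p_pos))
            + 0.5 * (np.log(np.linalg.det(S_neg)) - np.log(np.linalg.det(S_pos)))
            + 0.5 * maha(x, mu_neg, S_neg) - 0.5 * maha(x, mu_pos, S_pos))

# usage on synthetic data with labels y in {-1, +1}
X = np.random.randn(200, 2)
y = np.where(X[:, 0] + X[:, 1] > 0, 1, -1)
mu_p, S_p = fit_gaussian(X[y == 1])
mu_n, S_n = fit_gaussian(X[y == -1])
p_pos = np.mean(y == 1)
y_hat = np.sign(qda_log_odds(X[0], p_pos, mu_p, S_p, mu_n, S_n))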
diff --git a/iml/chapters/kernels.tex b/iml/chapters/kernels.tex
index 88d2fe8..3307126 100644
--- a/iml/chapters/kernels.tex
+++ b/iml/chapters/kernels.tex
@@ -25,4 +25,5 @@ \section*{Kernels}
 \end{rowlist}
 
 \textbf{Kern. Ridge Reg.}
-$\frac{1}{n} ||y - K\alpha ||_2^2 + \lambda \alpha^\top K \alpha$
+Ansatz: $w^*=\Phi^\top\alpha$\\
+$\min_w \frac1n \|y - \Phi w\|^2 + \lambda ||w||_2^2 = \min_\alpha \frac1n ||y - K\alpha||_2^2 + \lambda \alpha^\top K \alpha$
diff --git a/iml/main.tex b/iml/main.tex
deleted file mode 100644
index e6fb76b..0000000
--- a/iml/main.tex
+++ /dev/null
@@ -1,140 +0,0 @@
-% Basic stuff
-\documentclass[a4paper,11pt]{article}
-\usepackage{scrextend}
-
-% 3 column landscape layout with fewer margins
-\usepackage[landscape, left=0.75cm, top=0.75cm, right=0.75cm, bottom=1cm, footskip=15pt]{geometry}
-\usepackage{flowfram}
-\usepackage{bbm}
-\ffvadjustfalse
-\setlength{\columnsep}{0.5cm}
-\Ncolumn[<10]{4}
-\onecolumn[10]
-
-% define nice looking boxes
-\usepackage[many]{tcolorbox}
-\changefontsizes[11pt]{11pt}
-
-% a base set, that is then customised
-\tcbset {
-  base/.style={
-    boxrule=0mm,
-    leftrule=1mm,
-    left=1.75mm,
-    arc=0mm,
-    fonttitle=\bfseries,
-    colbacktitle=black!10!white,
-    coltitle=black,
-    toptitle=0.75mm,
-    bottomtitle=0.25mm,
-    title={#1}
-  }
-}
-
-\definecolor{brandblue}{rgb}{0.34, 0.7, 1}
-\newtcolorbox{mainbox}[1]{
-  colframe=brandblue,
-  base={#1}
-}
-
-\newtcolorbox{subbox}[1]{
-  colframe=black!20!white,
-  base={#1}
-}
-
-% Mathematical typesetting & symbols
-\usepackage{amsthm, mathtools, amssymb}
-\usepackage{marvosym, wasysym}
-\allowdisplaybreaks
-
-% Tables
-\usepackage{tabularx, multirow}
-\usepackage{makecell}
-\usepackage{booktabs}
-\renewcommand*{\arraystretch}{2}
-
-% Make enumerations more compact
-\usepackage{enumitem}
-
-% To include sketches & PDFs
-\usepackage{graphicx}
-
-% For hyperlinks
-\usepackage{hyperref}
-\hypersetup{
-  colorlinks=true
-}
-\renewcommand{\baselinestretch}{.9}
-% Metadata
-\title{Cheatsheet\\ Introduction to Machine Learning}
-\author{Thomas Gassmann}
-\date{\vspace{-10pt}Sommer 2024}
-
-% Math helper stuff
-\def\limxo{\lim_{x\to 0}}
-\def\limxi{\lim_{x\to\infty}}
-\def\limxn{\lim_{x\to-\infty}}
-\def\R{\mathbb{R}}
-\def\P{\mathbb{P}}
-\def\F{\mathcal{F}}
-\def\sumn{\sum_{n=0}^\infty}
-\def\sumk{\sum_{k=1}^\infty}
-\def\E{\mathbb{E}}
-\DeclareMathOperator{\Var}{\text{Var}}
-\newcommand{\middot}{~\textperiodcentered~}
-\newlist{rowlist}{enumerate*}{1}
-\setlist[rowlist]{label={\textbf{\roman*}\text{: }}, afterlabel={}, itemjoin=\middot}
-
-\newcommand{\C}{\mathbb{C}}
-\newcommand{\K}{\mathbb{K}}
-\newcommand{\N}{\mathbb{N}}
-\newcommand{\Q}{\mathbb{Q}}
-\newcommand{\Z}{\mathbb{Z}}
-\newcommand{\X}{\mathbb{X}}
-\renewcommand{\P}{\mathbb{P}}
-
-\newcommand{\No}{\mathcal{N}}
-
-\newcommand{\br}{\par\medskip\noindent}
-% \newcommand{\P}{\mathbb{P}}
-\newcommand{\Oh}{\mathcal{O}}
-% \newcommand{\vphi}{\varphi}
-% \newcommand{\veps}{\varepsilon}
-\newcommand{\bd}{\textbf}
-\newcommand{\equi}{\Leftrightarrow}
-\newcommand{\imp}{\Rightarrow}
-\newcommand{\emp}{\varnothing}
-\newcommand{\subs}{\subseteq}
-\newcommand{\ol}{\overline}
-\newcommand{\ra}{\rangle}
-\newcommand{\la}{\langle}
-\newcommand{\ox}{\otimes}
-\newcommand*\dx{\mathop{}\!\mathrm{d}x}
-\newcommand*\dy{\mathop{}\!\mathrm{d}y}
-\newcommand*\dz{\mathop{}\!\mathrm{d}z}
-
-\DeclareMathOperator*{\argmax}{arg\,max}
-\DeclareMathOperator*{\argmin}{arg\,min}
-\DeclareMathOperator*{\vol}{vol}
-\DeclareMathOperator*{\Mat}{Mat}
-
-\setlength{\parindent}{0pt}
-
-\makeatletter
-\renewcommand{\section}{\@startsection{section}{1}{0mm}%
-  {0pt}%
-  {0.5pt}%x
-  {\sffamily\bfseries\large}}
-\renewcommand{\subsection}{\@startsection{subsection}{1}{0mm}%
-  {0pt}%
-  {0.1pt}%x
-  {\sffamily\bfseries}}
-\begin{document}
-\small
-
-\input{Kernels}
-\input{ProbabilityModeling}
-\input{Generative}
-\input{Latent}
-
-\end{document}
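Not part of the diff: a minimal NumPy sketch of kernelized ridge regression as summarised in chapters/kernels.tex and the removed iml/Kernels.tex. It uses the closed form $\alpha^* = (K+\lambda I)^{-1} y$ for the un-normalised objective $||y-K\alpha||^2 + \lambda\alpha^\top K\alpha$; the RBF kernel, the hyperparameters, and the function names are arbitrary choices for the example.

# Kernelized ridge regression sketch (illustration only)
import numpy as np

def rbf_kernel(A, B, gamma=1.0):
    # k(a, b) = exp(-gamma ||a - b||^2), computed for all pairs of rows
    sq = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * sq)

def fit_kernel_ridge(X, y, lam=0.1, gamma=1.0):
    # alpha* = (K + lambda I)^{-1} y
    K = rbf_kernel(X, X, gamma)
    return np.linalg.solve(K + lam * np.eye(len(X)), y)

def predict(X_train, alpha, X_new, gamma=1.0):
    # y_hat(x) = sum_i alpha_i k(x_i, x)
    return rbf_kernel(X_new, X_train, gamma) @ alpha

X = np.linspace(0, 1, 50)[:, None]
y = np.sin(4 * X[:, 0]) + 0.1 * np.random.randn(50)
alpha = fit_kernel_ridge(X, y)
y_hat = predict(X, alpha, X)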
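Also not part of the diff: logistic regression trained by plain gradient descent on the loss and gradient given in the removed iml/ProbabilityModeling.tex, $\nabla_w \ell(w) = \frac{-yx}{1+\exp(yw^\top x)}$. The step size, iteration count, and synthetic data are arbitrary.

# Logistic regression via gradient descent (illustration only)
import numpy as np

def fit_logreg(X, y, lr=0.1, steps=500):
    # labels y take values in {-1, +1}; w starts at zero
    w = np.zeros(X.shape[1])
    for _ in range(steps):
        margins = y * (X @ w)
        # average of per-sample gradients -y x / (1 + exp(y w^T x))
        grad = -(X * (y / (1 + np.exp(margins)))[:, None]).mean(axis=0)
        w -= lr * grad
    return w

X = np.random.randn(300, 3)
y = np.sign(X @ np.array([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(300))
w = fit_logreg(X, y)
p_pos = 1 / (1 + np.exp(-(X @ w)))  # P(y = +1 | x) under the fitted model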
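Finally, also outside the diff: one soft-EM iteration for a GMM, following the responsibilities and M-step updates in iml/Latent.tex and chapters/gaussian-mixture-model.tex. SciPy's multivariate_normal supplies the Gaussian density, the small $\nu^2 I$ term mirrors the optional regulariser in the source, and all names are made up for the example.

# Soft-EM for a Gaussian mixture model (illustration only)
import numpy as np
from scipy.stats import multivariate_normal

def em_step(X, w, mu, Sigma, nu2=1e-6):
    n, d = X.shape
    k = len(w)
    # E-step: responsibilities gamma[i, j] = P(Z_i = j | x_i)
    dens = np.stack([w[j] * multivariate_normal.pdf(X, mu[j], Sigma[j])
                     for j in range(k)], axis=1)
    gamma = dens / dens.sum(axis=1, keepdims=True)
    # M-step: weighted MLE of the weights, means and covariances
    Nj = gamma.sum(axis=0)
    w_new = Nj / n
    mu_new = (gamma.T @ X) / Nj[:, None]
    Sigma_new = []
    for j in range(k):
        diff = X - mu_new[j]
        Sigma_new.append((gamma[:, j, None] * diff).T @ diff / Nj[j]
                         + nu2 * np.eye(d))  # optional +nu^2 I regulariser
    return w_new, mu_new, np.array(Sigma_new)

# usage: a few iterations from a random initialisation on two blobs
X = np.vstack([np.random.randn(100, 2), np.random.randn(100, 2) + 4])
w = np.array([0.5, 0.5])
mu = X[np.random.choice(len(X), 2)]
Sigma = np.array([np.eye(2)] * 2)
for _ in range(20):
    w, mu, Sigma = em_step(X, w, mu, Sigma)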