diff --git a/iml/Generative.tex b/iml/Generative.tex
deleted file mode 100644
index 2b85000..0000000
--- a/iml/Generative.tex
+++ /dev/null
@@ -1,44 +0,0 @@
-\section*{Generative Modeling}
-Discriminative: Estimate $P(y\mid x)$\\
-Generative: Estimate $P(y,x)$
-
-Typical approach to generative modeling:
-\begin{enumerate}[noitemsep,leftmargin=6mm,topsep=2pt,parsep=2pt,partopsep=2pt]
- \item Estimate prior on labels $P(y)$
- \item Estimate conditional distribution $P(x\mid y)$ for each class y
- \item Obtain predictive distribution using Bayes' rule:
-$P(y\mid x) = \frac{P(y) P(x\mid y)}{P(x)} = \frac{P(x,y)}{P(x)}$
-\end{enumerate}
-
-\subsection*{Decision rule}
-$\hat{y} = \operatorname{argmax_{y}} P(y\mid x)\\
-\hspace*{2.1mm}= \operatorname{argmax_{y}} P(y) \prod_{i} P(x_i\mid y)\\
-\hspace*{2.1mm}= \operatorname{argmax_{y}} \log P(y) + \sum_{i} \log P(x_i\mid y)$
-
-\subsection*{QDA/Gaussian Bayes Classifier}
-$P(Y=y) = p_y$ and $P(x\mid y) = \mathcal{N}({\mu}_y, {\Sigma}_y)$\\
-$\hat{p}_y= \frac{\operatorname{Count(Y = y)}}{n}$\\
-$\hat{\mu}_{y} = \frac{1}{\operatorname{Count}(Y=y)} \sum_{i:y_i=y} {x_i} $\\
-$\hat{\Sigma}_{y} = \frac{1}{\operatorname{Count}(Y=y)} \sum_{i:y_i=y} (x_i - \hat{\mu}_{y})(x_i-\hat{\mu}_y)^\top $
-
-For two classes $\hat{y} = \operatorname{sign}\Big(\log\frac{P(Y=1\mid x)}{P(Y=-1\mid x)}\Big) $
-\\ where
-$ \log\frac{P(Y=1\mid x)}{P(Y=-1\mid x)} = \log \frac{\hat{p}}{1-\hat{p}} + \frac{1}{2}\log \frac{|\hat{\Sigma}_-|}{|\hat{\Sigma}_+|}\\
- + \frac{1}{2}(x - \hat{\mu}_-)^\top \hat{\Sigma}_-^{-1} (x - \hat{\mu}_-) - \frac{1}{2}(x - \hat{\mu}_+)^\top \hat{\Sigma}_+^{-1} (x - \hat{\mu}_+)$
-
-\subsection*{Gaussian Naive Bayes}
-GBC with diagonal $\Sigma$s. GNB with shared $\Sigma$s across
-two classes yields the same predictions as Logistic Regression
-(if model is true).
-
-\subsection*{Fisher's LDA (Subcase of GBC)}
-Assume: Two classes, $p = 0.5$, ${\Sigma}_- = {\Sigma}_+ $
-
-\subsection*{Outlier Detection}
-Classify $x$ as outlier if $P(x) \leq \tau$.
-
-\subsection*{Regularization}
-\begin{itemize}[noitemsep,leftmargin=6mm,topsep=0pt,parsep=0pt,partopsep=0pt]
- \item Restricting model (i.e. covariance)
- \item Prior on parameters.\\
-\end{itemize}
diff --git a/iml/Kernels.tex b/iml/Kernels.tex
deleted file mode 100644
index 995d17c..0000000
--- a/iml/Kernels.tex
+++ /dev/null
@@ -1,8 +0,0 @@
-\section*{Kernels}
-
-\subsection*{Kernelized Ridge}
-Ansatz: $w^*=\Phi^\top\alpha$\\
-$\min_w\|\Phi w-y\|^2 + \lambda ||w||_2^2\\
-=\min_a ||K\alpha -y||_2^2 + \lambda \alpha^\top K \alpha$\\
-$\alpha^*=(K+\lambda I)^{-1} y$\\
-Prediction: $\hat{y} = \Sigma_{i=1}^n \alpha_i^* k(x_i,x)$\\
\ No newline at end of file
diff --git a/iml/Latent.tex b/iml/Latent.tex
deleted file mode 100644
index 3482428..0000000
--- a/iml/Latent.tex
+++ /dev/null
@@ -1,50 +0,0 @@
-\section*{Mixture Models}
-\subsection*{Gaussian Mixtures}
-$P(x\mid z) = \sum_iw_i\mathcal{N}({\mu}_i, {\Sigma}_i)$\\
-MLE is a nonconvex problem $\rightarrow$ EM.
-
-\subsection*{Hard-EM}
-\textbf{E-step: } Compute \\$z_i^{(t)} = \operatorname{argmax_z} P(z\mid x_i, \theta^{(t-1)})\\
-\hspace*{1.4mm}= \operatorname{argmax_z} P(z\mid \theta^{(t-1)}) P(x_i\mid z,\theta^{(t-1)})\\
- \stackrel{\text{GMM}}{=}\operatorname{argmax_z} w_z^{(t-1)}\mathcal{N}(x;\ \mu_z^{(t-1)},\Sigma_z^{(t-1)})$\\
-\textbf{M-step: }
-$\theta^{(t)} = \operatorname{argmax_\theta} P(x_{1:n},z^{(t)}_{1:n}\mid \theta)$\\
-Hard-EM converges to a local maximum
-of $P(x_{1:n},z^{(t)}_{1:n}\mid \theta)$. It tends to do poorly if clusters overlap.
-Hard-EM for GMM with $w_z=\frac{1}{k}, \Sigma_z=\sigma^2{I}$
-is equivalent to k-means.
-
-\subsection*{Soft-EM}
-\textbf{E-step: \newline} Compute the distribution of $Z\mid x,\theta^{(t-1)}$, i.e. for each $x$ the responsibilities
-\begin{equation*}
-  \begin{aligned}
-    P_{\theta^{(t-1)}}(Z=j\mid x) &= \frac{P(Z=j)P(x\mid Z=j)}{P(x)} \\
-    &\stackrel{\text{GMM}}{=}
-    \frac{w_j \mathcal{N}(x;\ \Sigma_j,\mu_j)}{\sum_k w_k \mathcal{N}(x;\ \Sigma_k,\mu_k)}.
-  \end{aligned}
-\end{equation*}
-
-\textbf{M-step:}
-\begin{equation*}
-  \begin{aligned}
-    \theta^{(t)}&=\argmax_\theta
-    \mathbb{E}_{Z_{1:n}\mid x_{1:n},\theta^{(t-1)}}
-    \big[\log P_\theta(x_{1:n},Z_{1:n})\big]\\
-    &\stackrel{\text{iid\&cond.ind.}}{=}
-    \sum_{i=1}^n \mathbb{E}_{Z_i\mid x_i,\theta^{(t-1)}}
-    \big[\log P_\theta(x_i,Z_i)\big] \\
-    &= \sum_{i=1}^n \sum_{j=1}^k P_{\theta^{(t-1)}}(Z_i=j\mid x_i)\log P_\theta(x_i,Z_i=j)
-  \end{aligned}
-\end{equation*}
-
-GMM M-step:\\ $w_j^{(t)} \leftarrow \frac{1}{n} \sum_{i} \gamma_j^{(t)} (x_i)$; \\
-$\mu_j^{(t)} \leftarrow \frac{\sum_{i} \gamma_j^{(t)} (x_i) x_i}{\sum_{i} \gamma_j^{(t)} (x_i)}\\
-\Sigma_j^{(t)} \leftarrow \frac{\sum_{i} \gamma_j^{(t)}(x_i) (x_i - \mu_j^{(t)}) (x_i - \mu_j^{(t)})^\top}{\sum_{i} \gamma_j^{(t)}(x_i)} \{+\nu^2\mathbb{I}\}$\\ %\text{|}\gamma^{(t) = \gamma}$
-
-The cluster size can be selected via CV.
-EM converges to a local maximum for GMMs, dependent on
-initialization.
-\subsection*{Semi-Supervised Learning w/ GMMs:}
-Set $P_{\theta^{(t-1)}}(Z=j\mid x) =1_{\{j = y\}}$ for labeled points
-$(x,y).$
-\\
diff --git a/iml/ProbabilityModeling.tex b/iml/ProbabilityModeling.tex
deleted file mode 100644
index af25cda..0000000
--- a/iml/ProbabilityModeling.tex
+++ /dev/null
@@ -1,31 +0,0 @@
-\section*{Probabilistic Modeling}
-\subsection*{MLE}
-Given a choice of marginal $P(Y|X,\theta)$ take
-$\theta^* =\argmax_\theta \prod_{i=1}^n {P_\theta}(y_i|x_i).$
-
-\subsection*{Bayes optimality}
-$\argmin_f\E_{x,y}[(y-f(x))^2]=\E[Y\mid X]$\\
-$\argmin_f\E_{x,y}[1_{[y\neq f(x)]}]\\=\argmax_yp(Y=y\mid X=x)$
-
-\subsection*{Bias-Variance-noise decomposition}
-$\E_{x,y}[(\hat{f}_D(x)- y)^2]= \E_x[\E_D[\hat{f}_D(x)]-f^*(x)]^2\\
-\hspace*{0.1mm}+\E_{x,D}[(\hat{f}_D(x) - \E_D[\hat{f}_D(x)])^2] +\E_{x,y}[(y-f^*(x))^2]$ where $f^*=\E[Y\mid X]$. Expected generalization error equals model bias squared, model variance and observation noise.
-
-
-\subsection*{Logistic regression}
-Parametrize $P(y\mid x)$ by $\frac{1}{1+\exp(-y w^\top x)}$.\\
-MLE is $\operatorname{argmax_w} P(y_{1:n}|w,x_{1:n})\\
-= \operatorname{argmin_w} - \sum_{i=1}^n \log P(y_i|w,x_i)\\
-= \operatorname{argmin_w} \sum_{i=1}^n \log(1+\exp(-y_i w^\top x_i))$
-
-\subsection*{Gradient for logistic regression}
-$\ell(w) = \log(1+\exp(-yw^\top x))$\\
-$\nabla_w \ell(w) =\frac{-yx}{1+\exp(yw^\top x)}$
-
-\subsection*{Multiclass Logistic Regression}
-Parametrize $P(Y=i\mid x)$ by $\frac{\exp(w_i^\top x)}{\sum_j \exp(w_j^\top x)}$.
-
-\subsection*{Kernelized logistic regression}
-$\min_\alpha\sum_i\log(1+\exp(-y_i\alpha^\top K_i)) + \lambda\alpha^\top K \alpha$
-$\hat{P}(y\mid x)=\frac{1}{1+\exp(-y\sum_i\alpha_ik(x_i,x))}$
-\\
\ No newline at end of file
diff --git a/iml/chapters/gaussian-mixture-model.tex b/iml/chapters/gaussian-mixture-model.tex
index 37d133c..0ac1690 100644
--- a/iml/chapters/gaussian-mixture-model.tex
+++ b/iml/chapters/gaussian-mixture-model.tex
@@ -1,6 +1,6 @@
 \section*{Gaussian Mixture Model}
 
-data is generated from a convex-combination of Gaussian distributions
+Data is generated from a convex combination of Gaussian distributions
 
 $p(x | \theta) = p(x | \mu, \Sigma, w) = \sum_{j=1}^k w_j \mathcal{N}(x; \mu_j, \Sigma_j)$
 
@@ -20,7 +20,7 @@ \subsection*{Hard-EM Algorithm}
 \end{align*}
 \textbf{M-Step}: compute MLE of $\theta^{(t)}$ as for GBC. \smallskip
 
-Problems: works poorly if clusters overlapping, with uniform weights and spherical covariances equivalent to k-means with Lloyd's heuristic
+Problems: works poorly if clusters overlap; Hard-EM for GMM with $w_z=\frac{1}{k}, \Sigma_z=\sigma^2{I}$ is equivalent to k-means.
 
 \subsection*{Soft-EM Algorithm}
 
diff --git a/iml/chapters/generative-modeling.tex b/iml/chapters/generative-modeling.tex
index 2023055..ead6c94 100644
--- a/iml/chapters/generative-modeling.tex
+++ b/iml/chapters/generative-modeling.tex
@@ -2,13 +2,19 @@ \section*{Generative Modeling}
 Aim to estimate $p(x, y)$ for complex situations using Bayes' rule: $p(x,y) = p(x|y) \cdot p(y)$
 
-\subsection*{Naive Bayes Model}
+\subsection*{Gaussian Bayes Classifier}
+
+No independence assumption; model the features with a multivariate Gaussian $\mathcal{N}(x; \mu_y, \Sigma_y)$:
+
+\quad $\hat \mu_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} x_{j}$
+
+\quad $\hat \Sigma_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} (x_{j} - \hat \mu_{y}) (x_{j} - \hat \mu_{y})^\top$
 
-GM for classification tasks. Assuming for a class label, each feature is independent. This helps estimating $p( x \; | \; y) =\prod_{i=1}^d p(x_i \; | \; y_i)$.
+This is also called \textbf{quadratic discriminant analysis} (QDA). LDA: $\Sigma_+ = \Sigma_-$; Fisher's LDA: additionally $p(y) = \frac{1}{2}$; classify $x$ as an outlier if $p(x) \leq \tau$.
 
 \subsection*{Gaussian Naive Bayes Classifier}
 
-Naive Bayes Model with ind. Gaussians features. Estimate the parameters via MLE:
+GBC with diagonal $\Sigma$s. Estimate the parameters via MLE:
 
 MLE for class prior: $p(y) = \hat p_y = \frac{\text{Count}(Y = y)}{n}$
 
 MLE for feature distribution:
@@ -31,16 +37,6 @@ \subsection*{Gaussian Naive Bayes Classifier}
 Where \color{Red}$f(x)$\color{Black} is called the discriminant function. If the conditional independence assumption is violated, the classifier can be overconfident.
 
-\subsection*{Gaussian Bayes Classifier}
-
-No independence assumption, model the features with a multivariate Gaussian $\mathcal{N}(x; \mu_y, \Sigma_y)$:
-
-\quad $\mu_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} x_{j}$
-
-\quad $\Sigma_{y} = \frac{1}{\text{Count}(Y = y)} \sum_{j \; | \; y_j = y} (x_{j} - \hat \mu_{y}) (x_{j} - \hat \mu_{y})^\top$
-
-This is also called the \textbf{quadratic discriminant analysis} (QDA). LDA: $\Sigma_+ = \Sigma_-$, Fisher LDA: $p(y) = \frac{1}{2}$, Outlier detection: $p(x) \leq \tau$.
-
 \subsection*{Avoiding Overfitting}
 
 MLE is prone to overfitting. Avoid this by restricting model class (fewer parameters, e.g. GNB) or using priors (restrict param. values).
 
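Reviewer note, not part of the diff: the Gaussian Bayes Classifier block moved above corresponds to a few lines of NumPy. The sketch below only illustrates the class-conditional estimators and the two-class log-odds rule quoted from the removed iml/Generative.tex; the function names and the synthetic data are made up for the example.

# Minimal QDA / Gaussian Bayes Classifier sketch (illustration only)
import numpy as np

def fit_gaussian(X):
    # MLE of mean and covariance for one class (normalised by the class count)
    mu = X.mean(axis=0)
    diff = X - mu
    Sigma = diff.T @ diff / X.shape[0]
    return mu, Sigma

def qda_log_odds(x, p_pos, mu_pos, S_pos, mu_neg, S_neg):
    # log P(Y=+1|x)/P(Y=-1|x) = log p/(1-p) + 0.5 log |S_-|/|S_+|
    #   + 0.5 (x-mu_-)^T S_-^{-1} (x-mu_-) - 0.5 (x-mu_+)^T S_+^{-1} (x-mu_+)
    def maha(v, mu, S):
        d = v - mu
        return d @ np.linalg.solve(S, d)
    return (np.log(p_pos / (1 - p_pos))
            + 0.5 * (np.log(np.linalg.det(S_neg)) - np.log(np.linalg.det(S_pos)))
            + 0.5 * maha(x, mu_neg, S_neg) - 0.5 * maha(x, mu_pos, S_pos))

# usage on synthetic data with labels y in {-1, +1}
X = np.random.randn(200, 2)
y = np.where(X[:, 0] + X[:, 1] > 0, 1, -1)
mu_p, S_p = fit_gaussian(X[y == 1])
mu_n, S_n = fit_gaussian(X[y == -1])
p_pos = np.mean(y == 1)
y_hat = np.sign(qda_log_odds(X[0], p_pos, mu_p, S_p, mu_n, S_n))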
diff --git a/iml/chapters/kernels.tex b/iml/chapters/kernels.tex
index 88d2fe8..3307126 100644
--- a/iml/chapters/kernels.tex
+++ b/iml/chapters/kernels.tex
@@ -25,4 +25,5 @@ \section*{Kernels}
 \end{rowlist}
 
 \textbf{Kern. Ridge Reg.}
-$\frac{1}{n} ||y - K\alpha ||_2^2 + \lambda \alpha^\top K \alpha$
+Ansatz: $w^*=\Phi^\top\alpha$\\
+$\min_w \frac1n \|y - \Phi w\|^2 + \lambda ||w||_2^2 = \min_\alpha \frac1n ||y - K\alpha||_2^2 + \lambda \alpha^\top K \alpha$
diff --git a/iml/main.tex b/iml/main.tex
deleted file mode 100644
index e6fb76b..0000000
--- a/iml/main.tex
+++ /dev/null
@@ -1,140 +0,0 @@
-% Basic stuff
-\documentclass[a4paper,11pt]{article}
-\usepackage{scrextend}
-
-% 3 column landscape layout with fewer margins
-\usepackage[landscape, left=0.75cm, top=0.75cm, right=0.75cm, bottom=1cm, footskip=15pt]{geometry}
-\usepackage{flowfram}
-\usepackage{bbm}
-\ffvadjustfalse
-\setlength{\columnsep}{0.5cm}
-\Ncolumn[<10]{4}
-\onecolumn[10]
-
-% define nice looking boxes
-\usepackage[many]{tcolorbox}
-\changefontsizes[11pt]{11pt}
-
-% a base set, that is then customised
-\tcbset {
-  base/.style={
-    boxrule=0mm,
-    leftrule=1mm,
-    left=1.75mm,
-    arc=0mm,
-    fonttitle=\bfseries,
-    colbacktitle=black!10!white,
-    coltitle=black,
-    toptitle=0.75mm,
-    bottomtitle=0.25mm,
-    title={#1}
-  }
-}
-
-\definecolor{brandblue}{rgb}{0.34, 0.7, 1}
-\newtcolorbox{mainbox}[1]{
-  colframe=brandblue,
-  base={#1}
-}
-
-\newtcolorbox{subbox}[1]{
-  colframe=black!20!white,
-  base={#1}
-}
-
-% Mathematical typesetting & symbols
-\usepackage{amsthm, mathtools, amssymb}
-\usepackage{marvosym, wasysym}
-\allowdisplaybreaks
-
-% Tables
-\usepackage{tabularx, multirow}
-\usepackage{makecell}
-\usepackage{booktabs}
-\renewcommand*{\arraystretch}{2}
-
-% Make enumerations more compact
-\usepackage{enumitem}
-
-% To include sketches & PDFs
-\usepackage{graphicx}
-
-% For hyperlinks
-\usepackage{hyperref}
-\hypersetup{
-  colorlinks=true
-}
-\renewcommand{\baselinestretch}{.9}
-% Metadata
-\title{Cheatsheet\\ Introduction to Machine Learning}
-\author{Thomas Gassmann}
-\date{\vspace{-10pt}Sommer 2024}
-
-% Math helper stuff
-\def\limxo{\lim_{x\to 0}}
-\def\limxi{\lim_{x\to\infty}}
-\def\limxn{\lim_{x\to-\infty}}
-\def\R{\mathbb{R}}
-\def\P{\mathbb{P}}
-\def\F{\mathcal{F}}
-\def\sumn{\sum_{n=0}^\infty}
-\def\sumk{\sum_{k=1}^\infty}
-\def\E{\mathbb{E}}
-\DeclareMathOperator{\Var}{\text{Var}}
-\newcommand{\middot}{~\textperiodcentered~}
-\newlist{rowlist}{enumerate*}{1}
-\setlist[rowlist]{label={\textbf{\roman*}\text{: }}, afterlabel={}, itemjoin=\middot}
-
-\newcommand{\C}{\mathbb{C}}
-\newcommand{\K}{\mathbb{K}}
-\newcommand{\N}{\mathbb{N}}
-\newcommand{\Q}{\mathbb{Q}}
-\newcommand{\Z}{\mathbb{Z}}
-\newcommand{\X}{\mathbb{X}}
-\renewcommand{\P}{\mathbb{P}}
-
-\newcommand{\No}{\mathcal{N}}
-
-\newcommand{\br}{\par\medskip\noindent}
-% \newcommand{\P}{\mathbb{P}}
-\newcommand{\Oh}{\mathcal{O}}
-% \newcommand{\vphi}{\varphi}
-% \newcommand{\veps}{\varepsilon}
-\newcommand{\bd}{\textbf}
-\newcommand{\equi}{\Leftrightarrow}
-\newcommand{\imp}{\Rightarrow}
-\newcommand{\emp}{\varnothing}
-\newcommand{\subs}{\subseteq}
-\newcommand{\ol}{\overline}
-\newcommand{\ra}{\rangle}
-\newcommand{\la}{\langle}
-\newcommand{\ox}{\otimes}
-\newcommand*\dx{\mathop{}\!\mathrm{d}x}
-\newcommand*\dy{\mathop{}\!\mathrm{d}y}
-\newcommand*\dz{\mathop{}\!\mathrm{d}z}
-
-\DeclareMathOperator*{\argmax}{arg\,max}
-\DeclareMathOperator*{\argmin}{arg\,min}
-\DeclareMathOperator*{\vol}{vol}
-\DeclareMathOperator*{\Mat}{Mat}
-
-\setlength{\parindent}{0pt}
-
-\makeatletter
-\renewcommand{\section}{\@startsection{section}{1}{0mm}%
-  {0pt}%
-  {0.5pt}%x
-  {\sffamily\bfseries\large}}
-\renewcommand{\subsection}{\@startsection{subsection}{1}{0mm}%
-  {0pt}%
-  {0.1pt}%x
-  {\sffamily\bfseries}}
-\begin{document}
-\small
-
-\input{Kernels}
-\input{ProbabilityModeling}
-\input{Generative}
-\input{Latent}
-
-\end{document}
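Not part of the diff: a minimal NumPy sketch of kernelized ridge regression as summarised in chapters/kernels.tex and the removed iml/Kernels.tex. It uses the closed form $\alpha^* = (K+\lambda I)^{-1} y$ for the un-normalised objective $||y-K\alpha||^2 + \lambda\alpha^\top K\alpha$; the RBF kernel, the hyperparameters, and the function names are arbitrary choices for the example.

# Kernelized ridge regression sketch (illustration only)
import numpy as np

def rbf_kernel(A, B, gamma=1.0):
    # k(a, b) = exp(-gamma ||a - b||^2), computed for all pairs of rows
    sq = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * sq)

def fit_kernel_ridge(X, y, lam=0.1, gamma=1.0):
    # alpha* = (K + lambda I)^{-1} y
    K = rbf_kernel(X, X, gamma)
    return np.linalg.solve(K + lam * np.eye(len(X)), y)

def predict(X_train, alpha, X_new, gamma=1.0):
    # y_hat(x) = sum_i alpha_i k(x_i, x)
    return rbf_kernel(X_new, X_train, gamma) @ alpha

X = np.linspace(0, 1, 50)[:, None]
y = np.sin(4 * X[:, 0]) + 0.1 * np.random.randn(50)
alpha = fit_kernel_ridge(X, y)
y_hat = predict(X, alpha, X)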
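Also not part of the diff: logistic regression trained by plain gradient descent on the loss and gradient given in the removed iml/ProbabilityModeling.tex, $\nabla_w \ell(w) = \frac{-yx}{1+\exp(yw^\top x)}$. The step size, iteration count, and synthetic data are arbitrary.

# Logistic regression via gradient descent (illustration only)
import numpy as np

def fit_logreg(X, y, lr=0.1, steps=500):
    # labels y take values in {-1, +1}; w starts at zero
    w = np.zeros(X.shape[1])
    for _ in range(steps):
        margins = y * (X @ w)
        # average of per-sample gradients -y x / (1 + exp(y w^T x))
        grad = -(X * (y / (1 + np.exp(margins)))[:, None]).mean(axis=0)
        w -= lr * grad
    return w

X = np.random.randn(300, 3)
y = np.sign(X @ np.array([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(300))
w = fit_logreg(X, y)
p_pos = 1 / (1 + np.exp(-(X @ w)))  # P(y = +1 | x) under the fitted model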
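Finally, also outside the diff: one soft-EM iteration for a GMM, following the responsibilities and M-step updates in iml/Latent.tex and chapters/gaussian-mixture-model.tex. SciPy's multivariate_normal supplies the Gaussian density, the small $\nu^2 I$ term mirrors the optional regulariser in the source, and all names are made up for the example.

# Soft-EM for a Gaussian mixture model (illustration only)
import numpy as np
from scipy.stats import multivariate_normal

def em_step(X, w, mu, Sigma, nu2=1e-6):
    n, d = X.shape
    k = len(w)
    # E-step: responsibilities gamma[i, j] = P(Z_i = j | x_i)
    dens = np.stack([w[j] * multivariate_normal.pdf(X, mu[j], Sigma[j])
                     for j in range(k)], axis=1)
    gamma = dens / dens.sum(axis=1, keepdims=True)
    # M-step: weighted MLE of the weights, means and covariances
    Nj = gamma.sum(axis=0)
    w_new = Nj / n
    mu_new = (gamma.T @ X) / Nj[:, None]
    Sigma_new = []
    for j in range(k):
        diff = X - mu_new[j]
        Sigma_new.append((gamma[:, j, None] * diff).T @ diff / Nj[j]
                         + nu2 * np.eye(d))  # optional +nu^2 I regulariser
    return w_new, mu_new, np.array(Sigma_new)

# usage: a few iterations from a random initialisation on two blobs
X = np.vstack([np.random.randn(100, 2), np.random.randn(100, 2) + 4])
w = np.array([0.5, 0.5])
mu = X[np.random.choice(len(X), 2)]
Sigma = np.array([np.eye(2)] * 2)
for _ in range(20):
    w, mu, Sigma = em_step(X, w, mu, Sigma)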