<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
<meta name="author" content="Piotr Mazurek">
<title>Vision Transformer</title>
<link rel="stylesheet" href="dist/reset.css">
<link rel="stylesheet" href="dist/reveal.css">
<link rel="stylesheet" href="dist/custom.css">
<link rel="stylesheet" href="plugin/pointer/pointer.css">
<link rel="stylesheet" href="plugin/drawer/drawer.css">
<link rel="stylesheet" href="dist/theme/sky.css" id="theme">
<!-- Theme used for syntax highlighting of code -->
<link rel="stylesheet" href="css/highlight/isbl-editor-light.css">
</head>
<body>
<div class="reveal">
<div class="slides">
<section>
<h2>AN IMAGE IS WORTH 16X16 WORDS</h2>
<h4>TRANSFORMERS FOR IMAGE RECOGNITION AT SCALE</h4>
<small>Piotr Mazurek</small>
</section>
<section>
<h2>Presentation plan</h2>
<ol>
<li>Overview</li>
<li>A brief history of transformers</li>
<li>Why is using transformers for CV complicated?</li>
<li>How does the Vision Transformer (ViT) work?</li>
<li>ViT performance in image classification</li>
<li>Criticism, impact, and my predictions (the fun part)</li>
</ol>
</section>
<section>
<h2>Assumptions</h2>
<ul>
<li>You know (the basics of) PyTorch</li>
<li>You understand the Transformer concept</li>
<li>You know (more or less) how BERT works ([CLS] token)</li>
</ul>
</section>
<section>
<h2>Overview</h2>
<img src="assets/vit.gif" class="r-stretch"> <br>
<small>Source: <a href="https://github.com/lucidrains/vit-pytorch">lucidrains/vit-pytorch </a></small>
</section>
<section>
<h2>Overview</h2>
<ul>
<li class="fragment">Divide an input image into 196 (14x14) small images of size (16x16)</li>
<li class="fragment">Treat it as embedding in NLP</li>
<li class="fragment">Use it as an input for traditional transformer encoder (like in BERT)</li>
<li class="fragment">Use 12 transformer layers (Norm, Multi-head attention, etc.)</li>
<li class="fragment">Take the last output, use it as input for Dense Layer with 1000 classes</li>
<li class="fragment">Voilà - you have a classification model</li>
</ul>
</section>
<section>
<h3>Attention is all you need</h3>
<img src="assets/attention.jpg" class="r-stretch"> <br>
<small>Source: <a
href="https://arxiv.org/abs/1706.03762">Attention Is All You Need</a><br>Vaswani et al.
2017</small>
</section>
<section>
<h3>Transformer Encoder</h3>
<img src="assets/encoder.png" class="r-stretch"> <br>
<pre><code data-trim data-noescape class="python">
# a single encoder block: multi-head attention + feed-forward
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
# the full encoder: a stack of 6 such blocks
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
</code></pre>
</section>
<section>
<h2>BERT</h2>
<img src="assets/bert_cls_token.png" class="r-stretch"> <br>
<small>Source: <a href="https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/">A
Visual Guide to Using BERT for the First Time</a>
<br>Alammar 2020</small>
</section>
<section>
<h4>Why don't we use the full image as input to a transformer?</h4>
<div class="fragment">Because <b>complexity</b></div>
<p class="fragment">
Self-attention stores an $n \times n$ attention matrix, i.e. $n^2$ entries
</p>
<p class="fragment">
$(3 \times 224 \times 224)^2 \approx 22.7 \text{ billion entries}$
</p>
</p>
<div class="fragment">
<img src="assets/Visualizing-self-attention.png" class="r-stretch"> <br>
<small>Source: <a
href="https://www.researchgate.net/publication/323587007_Deep_Learning_Based_Chatbot_Models">
Deep Learning Based Chatbot Models </a><br>Csaky 2017</small>
</div>
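<div class="fragment">
<small>A minimal sketch of the estimate above (assuming naive pixel-level self-attention over a flattened $3 \times 224 \times 224$ image; names are illustrative):</small>
<pre><code data-trim data-noescape class="python">
n = 3 * 224 * 224                # sequence length if every pixel value were a token
attention_entries = n ** 2       # the n x n attention matrix
print(f"{attention_entries:,}")  # 22,658,678,784, roughly 22.7 billion
</code></pre>
</div>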
</section>
<section>
<h2>Architecture recap</h2>
<img src="assets/ViT.png" class="r-stretch"> <br>
<small>Source: <a href="https://arxiv.org/abs/2010.11929"> Vision transformer paper </a>
<br>Dosovitskiy et al. 2020</small>
</section>
<section>
<h2>Patch embeddings</h2>
<small>
<div class="fragment"> Image of size $(3, 224, 224)$</div>
<br>
<div class="fragment">Divided into $196\: (14 \times 14)$ patches of size $3 \times 16 \times 16$</div>
<br>
<div class="fragment">D16*14 = 224 (original image size)</div>
</small>
<div class="fragment">
<img src="assets/patches_1.png" class="r-stretch"> <br>
<small>Source: <a href="https://amaarora.github.io/2021/01/18/ViT.html"> Committed towards better
future </a>
<br>Bukhari 2021</small>
</div>
</section>
<section>
<h2>Patch embeddings</h2>
<div class="fragment">Each patch is converted into a vector of size $3 \times 16 \times 16 = 768$</div>
<br>
<div class="fragment">So after that, we have <b>196</b> vectors of size <b>768</b>, a matrix of size $(196,
768)$
</div>
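<div class="fragment">
<small>A hedged sketch of the patch split itself using torch.Tensor.unfold (illustrative; the paper's implementation uses the strided convolution shown on the next slide):</small>
<pre><code data-trim data-noescape class="python">
import torch

x = torch.randn(3, 224, 224)                     # a single image
# cut the image into a 14 x 14 grid of 16 x 16 patches
patches = x.unfold(1, 16, 16).unfold(2, 16, 16)  # (3, 14, 14, 16, 16)
patches = patches.permute(1, 2, 0, 3, 4)         # (14, 14, 3, 16, 16)
patches = patches.reshape(196, 3 * 16 * 16)      # one flattened patch per row
print(patches.shape)                             # torch.Size([196, 768])
</code></pre>
</div>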
</section>
<section>
<h2>Patch embeddings</h2>
<img src="assets/patches_conv.png" class="r-stretch"> <br>
<small>Source: <a href="https://amaarora.github.io/2021/01/18/ViT.html"> Committed towards better future </a>
<br>Bukhari 2021</small>
<pre><code data-trim data-noescape class="python">
x = torch.randn(1, 3, 224, 224)
# 2D conv: 16x16 kernel with stride 16 - one output "pixel" per patch
conv = nn.Conv2d(3, 768, kernel_size=16, stride=16)
# (1, 768, 14, 14) -> (768, 196) -> (196, 768)
conv(x).reshape(-1, 196).transpose(0, 1).shape
>> torch.Size([196, 768])
</code></pre>
</section>
<section>
<h2>[CLS] Token</h2>
<img src="assets/cls_patches.png" class="r-stretch"> <br>
<small>Source: <a href="https://amaarora.github.io/2021/01/18/ViT.html"> Committed towards better future </a>
<br>Bukhari 2021</small>
<small>As in BERT, we prepend a [CLS] token<br>
The [CLS] token is a vector of size $(1, 768)$<br>
The final patch matrix has size $(197, 768)$: 196 rows from the patches and 1 from the [CLS] token
</small>
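<div class="fragment">
<small>A minimal sketch of prepending the learnable [CLS] token (illustrative names, random patch embeddings):</small>
<pre><code data-trim data-noescape class="python">
import torch
import torch.nn as nn

patch_embeddings = torch.randn(1, 196, 768)       # (batch, patches, dim)
cls_token = nn.Parameter(torch.zeros(1, 1, 768))  # learnable, shared across the batch
cls_tokens = cls_token.expand(patch_embeddings.shape[0], -1, -1)
tokens = torch.cat((cls_tokens, patch_embeddings), dim=1)
print(tokens.shape)                               # torch.Size([1, 197, 768])
</code></pre>
</div>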
</section>
<section>
<h3>Transformer encoder recap</h3>
<img src="assets/encoder.png" class="r-stretch"> <br>
<small>
We have the input embeddings: a patch matrix of size $(196, 768)$<br>
We still need the position embeddings
</small>
</section>
<section>
<h2>Position embedding</h2>
<img src="assets/positions.png" class="r-stretch"> <br>
<small>Source: <a href="https://arxiv.org/abs/2010.11929"> Vision transformer paper </a>
<br>Dosovitskiy et al. 2020</small>
<small>
<blockquote>"We use standard learnable 1D position embeddings and the resulting sequence of embedding
vectors serves as input to the encoder"
</blockquote>
</small>
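<div class="fragment">
<small>A minimal sketch of the learnable 1D position embeddings from the quote above (illustrative; a learnable table of shape $(197, 768)$ stored as a plain parameter would work just as well):</small>
<pre><code data-trim data-noescape class="python">
import torch
import torch.nn as nn

tokens = torch.randn(1, 197, 768)                  # [CLS] + 196 patch embeddings
position_embedding_layer = nn.Embedding(197, 768)  # one learnable vector per position
positions = torch.arange(197).unsqueeze(0)         # (1, 197): 0, 1, ..., 196
encoder_input = tokens + position_embedding_layer(positions)
print(encoder_input.shape)                         # torch.Size([1, 197, 768])
</code></pre>
</div>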
</section>
<section>
<h2>Position embedding similarities</h2>
<img src="assets/positions_similarities.png" class="r-stretch"> <br>
<small>Source: <a href="https://arxiv.org/abs/2010.11929"> Vision transformer paper </a>
<br>Dosovitskiy et al. 2020</small>
</section>
<section>
<h3>Vision Transformer put together</h3>
<img src="assets/vit_horizontaly.png" class="r-stretch"> <br>
<small>Source: <a href="https://amaarora.github.io/2021/01/18/ViT.html"> Committed towards better future </a>
<br>Bukhari 2021</small>
</section>
<section>
<h2>Transformer layers</h2>
<img src="assets/transformers.png" class="r-stretch"> <br>
<small>Source: <a href="https://amaarora.github.io/2021/01/18/ViT.html"> Committed towards better future </a>
<br>Bukhari 2021</small>
</section>
<section>
<h2>End-to-end training</h2>
<pre><code data-trim data-noescape class="python">
import torch
import torch.nn as nn
import pytorch_lightning as pl

class ViT(pl.LightningModule):
    def __init__(self, num_transformer_layers, num_classes=1000, embed_dim=768):
        super().__init__()
        self.criterion = nn.CrossEntropyLoss()
        # strided 16x16 conv = patch split + linear projection
        self.conv_embedding = nn.Conv2d(3, embed_dim, kernel_size=16, stride=16)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # batch_first so inputs/outputs are (batch, seq, dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=8, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_transformer_layers)
        self.mlp_head = nn.Linear(embed_dim, num_classes)
        self.position_embedding_layer = nn.Embedding(197, embed_dim)

    def forward(self, x):
        batch_size = x.shape[0]
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        # (batch_size, 196, 768)
        patches_embedding = self.conv_embedding(x).flatten(2).transpose(1, 2)
        # (batch_size, 197, 768)
        patches_embedding = torch.cat((cls_tokens, patches_embedding), dim=1)
        # (batch_size, 197): 0, 1, ..., 196 for every image in the batch
        positions = torch.arange(197, device=x.device).expand(batch_size, -1)
        # (batch_size, 197, 768)
        position_embedding = self.position_embedding_layer(positions)
        # (batch_size, 197, 768)
        final_embedding = patches_embedding + position_embedding
        # (batch_size, 197, 768)
        embedding_output = self.transformer_encoder(final_embedding)
        # (batch_size, 768) - the [CLS] vector
        cls_vector = embedding_output[:, 0, :]
        # (batch_size, num_classes)
        return self.mlp_head(cls_vector)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=3e-4)
</code></pre>
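<div class="fragment">
<small>A quick smoke test of the class above (a hedged sketch with random inputs; actual training, per the paper, needs large labelled datasets and a lot of compute):</small>
<pre><code data-trim data-noescape class="python">
model = ViT(num_transformer_layers=12)      # 12 layers, as in ViT-Base
dummy_images = torch.randn(2, 3, 224, 224)  # two random "images"
logits = model(dummy_images)
print(logits.shape)                         # torch.Size([2, 1000])
</code></pre>
</div>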
</section>
<section>
<h3>How good is ViT performance?</h3>
<h4>TL; DR</h4>
<ul>
<li class="fragment">Worse than Resnet when trained just on ImageNet</li>
<li class="fragment">Performance improved when pre-trained on big (and I mean it) dataset</li>
<li class="fragment">Pretrained outperforms much bigger CNNs</li>
</ul>
</section>
<section>
<h2>ViT in numbers</h2>
<img src="assets/vit_table.png" class="r-stretch"> <br>
<small>Source: <a href="https://arxiv.org/abs/2010.11929"> Vision transformer paper </a>
<br>Dosovitskiy et al. 2020</small>
</section>
<section>
<h2>Rule of thumb</h2>
<h4 class="fragment">Are you?</h4>
<ul class="fragment">
<li>A FAANG company</li>
<li>A research lab with a budget equal to the GDP of a developing country</li>
</ul>
<h4 class="fragment">ViT may be for you</h4>
<h4 class="fragment">Otherwise use EfficientNet, It is fine</h4>
<small class="fragment">It takes 2.5k TPUv3 core days to train the best ViT model</small>
<small class="fragment">At around USD 0.5 per core TPU/h, 2.5k*0.5*24h=USD 30,000</small>
</section>
<section>
<h3>Critics</h3>
<ul>
<li class="fragment">Better results - only with more data</li>
<li class="fragment">The cost of training from scratch is ridiculously high (30k$)</li>
<li class="fragment">Is it really that different from Convolutions?</li>
</ul>
</section>
<section>
<h3>Paper's impact</h3>
<h5>New punchline: <b>Is worth</b></h5>
<a href="https://arxiv.org/abs/2103.13915">An Image is Worth 16x16 Words, What is a Video Worth?</a><br><br>
<a href="https://arxiv.org/abs/2103.13915">A Video Is Worth Three Views: Trigeminal Transformers for
Video-based Person Re-identification</a>
</section>
<section>
<h3>Paper's impact</h3>
First time transformers outperform CNNs in CV<br><br>
2021 - Year of transformers in CV?
</section>
<section>
<h3>Paper's impact</h3>
New, similar architectures have already emerged
<img src="assets/swin_transformer.png" class="r-stretch"> <br>
<small>Source: <a href="https://arxiv.org/abs/2103.14030">Swin Transformer: Hierarchical Vision
Transformer
using Shifted Windows </a>
<br>Liu et al. 2021</small>
</section>
<section>
<h3>Prediction #1</h3>
<blockquote>Transformers were originally seq2seq models; using ViT as
a tool for captioning images is a "no-brainer"
</blockquote>
</section>
<section>
<h3>Prediction #2</h3>
<blockquote>BERT-like embeddings for images - prototyping of CV models drastically accelerated</blockquote>
</section>
<section>
<h3>Prediction #3</h3>
<blockquote>EVEN bigger models - even better image classification</blockquote>
</section>
<section>
<h3>Prediction #4</h3>
<blockquote>More sophisticated (yet efficient) approaches to patches</blockquote>
</section>
<section>
<h2>Thanks</h2>
<blockquote>"Feel free to ask any question"</blockquote>
<p>
<small>Piotr Mazurek</small><br/>
<small><a href="https://tugot17.github.io/Vision-Transformer-Presentation/">tugot17.github.io/Vision-Transformer-Presentation/</a></small><br/>
</p>
</section>
<section>
<h2>Self Attention</h2>
<h4 class="fragment">Query, Value, Key</h4>
<div class="fragment">
$\text{Attention}(Q, K, V) = \text{softmax}(\frac{QK^T}{\sqrt{d_k}})V$<br>
</div>
<div class="fragment">
For an input sequence of $n$ tokens, each with embedding size $d$<br>
</div>
<p class="display-row">
<ul class="fragment">
<li>$Q$ size: $(n,d)$</li>
<li>$K^T$ size: $(d, n)$</li>
<li>$V$ size: $(n,d)$</li>
</ul>
<ul class="fragment">
<li>$QK^T$: $n^2$ complexity</li>
<li>$softmax$: $n^2$ complexity</li>
<li>$(QK^T)V$: $n^2$ complexity</li>
</ul>
</p>
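<div class="fragment">
<small>A minimal sketch of the formula above (single head, no learned projections; shapes only):</small>
<pre><code data-trim data-noescape class="python">
import math
import torch

n, d = 197, 768                    # tokens, embedding size
Q, K, V = (torch.randn(n, d) for _ in range(3))

scores = Q @ K.T / math.sqrt(d)    # (n, n) - this is the n^2 memory cost
weights = scores.softmax(dim=-1)   # softmax over each row
out = weights @ V                  # (n, d)
print(scores.shape, out.shape)     # torch.Size([197, 197]) torch.Size([197, 768])
</code></pre>
</div>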
</section>
</div>
</div>
<script src="dist/reveal.js"></script>
<script src="plugin/zoom/zoom.js"></script>
<script src="plugin/notes/notes.js"></script>
<script src="plugin/markdown/markdown.js"></script>
<script src="plugin/highlight/highlight.js"></script>
<script src="plugin/math/math.js"></script>
<script src="plugin/menu/menu.js"></script>
<script src="plugin/pointer/pointer.js"></script>
<script src="plugin/drawer/drawer.js"></script>
<script>
// More info about config & dependencies:
// - https://github.com/hakimel/reveal.js#configuration
// - https://github.com/hakimel/reveal.js#dependencies
Reveal.initialize({
hash: true,
controls: true,
progress: true,
slideNumber: true,
overview: true,
center: true,
navigationMode: 'default',
fragmentInURL: false,
embedded: false,
preloadIframes: null,
autoSlide: 0,
autoSlideStoppable: true,
defaultTiming: 120,
mouseWheel: false,
previewLinks: false,
transition: 'slide', // none/fade/slide/convex/concave/zoom
transitionSpeed: 'default', // default/fast/slow
backgroundTransition: 'fade', // none/fade/slide/convex/concave/zoom
display: 'block',
math: {
// mathjax: 'plugin/math/MathJax.js', // You can download MathJax locally and keep it in plugins folder
config: 'TeX-AMS_HTML-full', // See http://docs.mathjax.org/en/latest/config-files.html
// pass other options into `MathJax.Hub.Config()`
TeX: {Macros: {RR: "{\\bf R}"}}
},
menu: {
themes: [
{name: 'Default', theme: 'dist/theme/black.css'},
{name: 'Dark', theme: 'dist/theme/blood.css'},
{name: 'Simple', theme: 'dist/theme/simple.css'},
{name: 'Serif', theme: 'dist/theme/serif.css'},
{name: 'Night', theme: 'dist/theme/night.css'},
{name: 'Blue', theme: 'dist/theme/sky.css'},
]
},
pointer: {
color: "red",
pointerSize: 18,
alwaysVisible: false
},
drawer: {
toggleBoardKey: "t",
toggleDrawKey: "d",
pathSize: 4
},
plugins: [RevealZoom, RevealMarkdown, RevealHighlight, RevealNotes, RevealMath, RevealMenu, RevealPointer, RevealDrawer]
});
</script>
</body>
</html>