% dirty.bib
@inproceedings{wuVideoCompressionImage2018,
title = {{Video Compression through Image Interpolation}},
author = {Wu, Chao-Yuan and Singhal, Nayan and Kr\"ahenb\"uhl, Philipp},
year = {2018},
booktitle=eccv,
}
@InProceedings{Habibian_2019_ICCV,
author = {Habibian, Amirhossein and {van Rozendaal}, Ties and Tomczak, Jakub M. and Cohen, Taco S.},
title = {{Video Compression With Rate-Distortion Autoencoders}},
booktitle = iccv,
year = {2019}
}
@InProceedings{rippelLearnedVideoCompression2018,
title = {{Learned Video Compression}},
booktitle = iccv,
author = {Rippel, Oren and Nair, Sanjay and Lew, Carissa and Branson, Steve and Anderson, Alexander G. and Bourdev, Lubomir},
year = {2019},
}
@inproceedings{stephan_2019_neurips,
author = {Jun Han and Salvator Lombardo and Christopher Schroers and Stephan Mandt},
title = {{Deep Probabilistic Video Compression}},
booktitle = neurips,
year = {2019},
}
@inproceedings{luDVCEndtoendDeep2018,
title = {{DVC: An End-to-End Deep Video Compression Framework}},
author = {Lu, Guo and Ouyang, Wanli and Xu, Dong and Zhang, Xiaoyun and Cai, Chunlei and Gao, Zhiyong},
booktitle = cvpr,
year = {2019},
}
@inproceedings{mentzerConditionalProbabilityModels2018,
title = {{Conditional Probability Models for Deep Image Compression}},
booktitle = cvpr,
author = {Mentzer, Fabian and Agustsson, Eirikur and Tschannen, Michael and Timofte, Radu and Van Gool, Luc},
year = {2018},
}
@inproceedings{liu2020learned,
title={{Learned Video Compression via Joint Spatial-Temporal Correlation Exploration}},
author={Haojie Liu and Han Shen and Lichao Huang and Ming Lu and Tong Chen and Zhan Ma},
year={2020},
booktitle = aaai,
}
@inproceedings{Djelouah_2019_ICCV,
author = {Djelouah, Abdelaziz and Campos, Joaquim and Schaub-Meyer, Simone and Schroers, Christopher},
title = {{Neural Inter-Frame Compression for Video Coding}},
booktitle = iccv,
year = {2019}
}
@inproceedings{theisLossyImageCompression2017,
title = {{Lossy Image Compression with Compressive Autoencoders}},
author = {Theis, Lucas and Shi, Wenzhe and Cunningham, Andrew and Husz\'ar, Ferenc},
year = {2017},
booktitle=iclr,
}
@inproceedings{jaderberg2015spatial,
title={{Spatial Transformer Networks}},
author={Max Jaderberg and Karen Simonyan and Andrew Zisserman and Koray Kavukcuoglu},
year={2015},
booktitle = neurips,
}
@inproceedings{UNet,
author = {Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas},
booktitle = miccai,
title = {{U-Net: Convolutional Networks for Biomedical Image Segmentation}},
year = {2015},
}
@inproceedings{FRAE,
title={{Feedback Recurrent AutoEncoder}},
author={Yang Yang and Guillaume Sauti{\'e}re and J. Jon Ryu and Taco S Cohen},
year={2019},
booktitle = {ICASSP},
}
@inproceedings{vandenOord2016pixelCNN,
title = {{Conditional Image Generation with PixelCNN Decoders}},
author = {{van den Oord}, Aaron and Kalchbrenner, Nal and Espeholt, Lasse and Kavukcuoglu, Koray and Vinyals, Oriol and Graves, Alex},
booktitle = neurips,
year = {2016},
}
@article{hu2020learning,
title={Learning End-to-End Lossy Image Compression: A Benchmark},
author={Yueyu Hu and Wenhan Yang and Zhan Ma and Jiaying Liu},
year={2020},
journal = {arXiv:2002.03711},
}
@inproceedings{balle2016end,
title = {{End-to-End Optimized Image Compression}},
author = {Ball\'e, Johannes and Laparra, Valero and Simoncelli, Eero P},
year = {2017},
booktitle = iclr,
}
@inproceedings{balleVARIATIONALIMAGECOMPRESSION2018,
title = {{Variational Image Compression with a Scale Hyperprior}},
author = {Ball\'e, Johannes and Minnen, David and Singh, Saurabh and Hwang, Sung Jin and Johnston, Nick},
year = {2018},
booktitle = iclr,
}
@inproceedings{rippel2017real,
title={{Real-time adaptive image compression}},
author={Rippel, Oren and Bourdev, Lubomir},
booktitle=icml,
year={2017},
}
@inproceedings{betavae2017iclr,
title={{$\beta$-VAE: Learning Basic Visual Concepts with a Constrained Variational Framework}},
author={Irina Higgins and Loic Matthey and Arka Pal and Christopher Burgess and Xavier Glorot and Matthew Botvinick and Shakir Mohamed and Alexander Lerchner},
booktitle=iclr,
year={2017},
}
@inproceedings{Gregor2016,
title={{Towards Conceptual Compression}},
author={Karol Gregor and Frederic Besse and Danilo Jimenez Rezende and Ivo Danihelka and Daan Wierstra},
booktitle = neurips,
year = {2016},
}
@inproceedings{Gregor2015,
title={{DRAW: A Recurrent Neural Network For Image Generation}},
author={Karol Gregor and Ivo Danihelka and Alex Graves and Danilo Jimenez Rezende and Daan Wierstra},
booktitle = icml,
year={2015},
}
@article{l1_plus_msssim,
author={H. {Zhao} and O. {Gallo} and I. {Frosio} and J. {Kautz}},
journal={IEEE Transactions on Computational Imaging},
title={{Loss Functions for Image Restoration With Neural Networks}},
year={2017},
}
@inproceedings{convlstm,
author = {Xingjian Shi and
Zhourong Chen and
Hao Wang and
Dit{-}Yan Yeung and
Wai{-}Kin Wong and
Wang{-}chun Woo},
title = {{Convolutional LSTM Network: A Machine Learning Approach for Precipitation
Nowcasting}},
booktitle = neurips,
year = {2015},
}
@inproceedings{minnenJointAutoregressiveAndHierarchicalPriors2018,
title = {Joint Autoregressive and Hierarchical Priors for Learned Image Compression},
author = {Minnen, David and Ball\'{e}, Johannes and Toderici, George D},
booktitle = neurips,
year = 2018,
}
@article{liuNLAICImageCompression2019,
title={{Non-local Attention Optimized Deep Image Compression}},
author={Haojie Liu and Tong Chen and Peiyao Guo and Qiu Shen and Xun Cao and Yao Wang and Zhan Ma},
year={2019},
journal={arXiv:1904.09757},
}
@inproceedings{balleDensity2015,
title={{Density Modeling of Images using a Generalized Normalization Transformation}},
author={Johannes Ball\'e and Valero Laparra and Eero P. Simoncelli},
booktitle = iclr,
year={2016},
}
@book{source_coding,
title = {{Digital Signal Compression: Principles and Practice}},
author = {William A. Pearlman and Amir Said},
year = {2011},
publisher = {Cambridge University Press},
}
@inproceedings{webb2019inversion,
title = {{Faithful Inversion of Generative Models for Effective Amortized Inference}},
author = {Webb, Stefan and Goli{\'n}ski, Adam and Zinkov, Rob and N, Siddharth and Rainforth, Tom and Teh, Yee Whye and Wood, Frank},
booktitle = neurips,
year = {2018},
}
@inproceedings{krishnan2017structured,
author = {Krishnan, Rahul G. and Shalit, Uri and Sontag, David},
title = {Structured Inference Networks for Nonlinear State Space Models},
year = {2017},
booktitle = aaai,
}
@book{koller2009probabilistic,
author = {Koller, D. and Friedman, N.},
isbn = {9780262013192},
lccn = {2009008615},
publisher = {MIT Press},
title = {Probabilistic Graphical Models: Principles and Techniques},
year = 2009
}
@article{Kinetics600,
author = {Jo{\~{a}}o Carreira and
          Eric Noland and
          Andras Banki{-}Horvath and
          Chloe Hillier and
          Andrew Zisserman},
title = {{A Short Note about Kinetics-600}},
journal = {arXiv:1808.01340},
year = {2018},
}
@inproceedings{heDeepResidualLearning2016,
title = {{Deep Residual Learning for Image Recognition}},
booktitle = cvpr,
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
year = {2016},
}
@inproceedings{broken_elbo,
author = {Alexander A. Alemi and
Ben Poole and
Ian Fischer and
Joshua V. Dillon and
Rif A. Saurous and
Kevin Murphy},
title = {{Fixing a Broken ELBO}},
booktitle = icml,
year = {2018},
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@misc{HM,
title = {{High Efficiency Video Coding (HEVC)}},
howpublished = {\url{https://hevc.hhi.fraunhofer.de/}},
note = {Accessed: 2020-02-21}
}
@misc{digital_video_introduction,
title = {Digital Video Introduction},
howpublished = {\url{https://github.com/leandromoreira/digital_video_introduction/blob/master/README.md#frame-types}},
note = {Accessed: 2020-03-02}
}
@misc{ffmpeg,
title = {ffmpeg},
howpublished = {\url{http://ffmpeg.org/}},
note = {Accessed: 2020-02-21}
}
@misc{netflix-data-usage,
title = {How Much Data Does {Netflix} Use?},
howpublished = {\url{https://www.howtogeek.com/338983/how-much-data-does-netflix-use/}},
note = {Accessed: 2020-02-28}
}
@misc{video-data-global-usage,
title = {2019 Global Internet Phenomena Report},
howpublished = {\url{https://www.ncta.com/whats-new/report-where-does-the-majority-of-internet-traffic-come}},
note = {Accessed: 2020-02-28}
}
@article{Kinetics400,
author = {Will Kay and
Jo{\~{a}}o Carreira and
Karen Simonyan and
Brian Zhang and
Chloe Hillier and
Sudheendra Vijayanarasimhan and
Fabio Viola and
Tim Green and
Trevor Back and
Paul Natsev and
Mustafa Suleyman and
Andrew Zisserman},
title = {{The Kinetics Human Action Video Dataset}},
journal = {arXiv:1705.06950},
year = {2017},
}
% Another possible temporal coherency loss
% https://research.nvidia.com/publication/interactive-reconstruction-monte-carlo-image-sequences-using-recurrent-denoising
@InProceedings{FlowNet,
author = {A. Dosovitskiy and P. Fischer and E. Ilg and P. H{\"a}usser and C. Haz{\i}rba{\c{s}} and V. Golkov and P. v.d. Smagt and D. Cremers and T. Brox},
title = {FlowNet: Learning Optical Flow with Convolutional Networks},
booktitle = iccv,
year = {2015},
}
% bibtex files for ICCV 2019 RDAE paper
@inproceedings{agustssonSofttoHardVectorQuantization2017,
title = {Soft-to-{{Hard Vector Quantization}} for {{End}}-to-{{End Learning Compressible Representations}}},
booktitle = neurips,
publisher = {{Curran Associates, Inc.}},
author = {Agustsson, Eirikur and Mentzer, Fabian and Tschannen, Michael and Cavigelli, Lukas and Timofte, Radu and Benini, Luca and Gool, Luc V},
year = {2017},
pages = {1141-1151}
}
@article{agustssonGenerativeAdversarialNetworks2018a,
title = {Generative {{Adversarial Networks}} for {{Extreme Learned Image Compression}}},
abstract = {We propose a framework for extreme learned image compression based on
Generative Adversarial Networks (GANs), obtaining visually pleasing images
at significantly lower bitrates than previous methods. This is made
possible through our GAN formulation of learned compression combined with
a generator/decoder which operates on the full-resolution image and is
trained in combination with a multi-scale discriminator. Additionally, our
method can fully synthesize unimportant regions in the decoded image such
as streets and trees from a semantic label map extracted from the original
image, therefore only requiring the storage of the preserved region and
the semantic label map. A user study confirms that for low bitrates, our
approach significantly outperforms state-of-the-art methods, saving up to
67\% compared to the next-best method BPG.},
author = {Agustsson, Eirikur and Tschannen, Michael and Mentzer, Fabian and Timofte, Radu and Van Gool, Luc},
journal={arXiv preprint arXiv:1804.02958},
year = {2018},
}
@article{alemiFixingBrokenELBO2017,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1711.00464},
primaryClass = {cs, stat},
title = {Fixing a {{Broken ELBO}}},
abstract = {Recent work in unsupervised representation learning has focused on learning deep directed latent-variable models. Fitting these models by maximizing the marginal likelihood or evidence is typically intractable, thus a common approximation is to maximize the evidence lower bound (ELBO) instead. However, maximum likelihood training (whether exact or approximate) does not necessarily result in a good latent representation, as we demonstrate both theoretically and empirically. In particular, we derive variational lower and upper bounds on the mutual information between the input and the latent variable, and use these bounds to derive a rate-distortion curve that characterizes the tradeoff between compression and reconstruction accuracy. Using this framework, we demonstrate that there is a family of models with identical ELBO, but different quantitative and qualitative characteristics. Our framework also suggests a simple new method to ensure that latent variable models with powerful stochastic decoders do not ignore their latent code.},
journal = {arXiv:1711.00464},
author = {Alemi, Alexander A. and Poole, Ben and Fischer, Ian and Dillon, Joshua V. and Saurous, Rif A. and Murphy, Kevin},
month = nov,
year = {2017},
}
@inproceedings{andrilukaPoseTrackBenchmarkHuman2018,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1710.10000},
title = {{{PoseTrack}}: {{A Benchmark}} for {{Human Pose Estimation}} and {{Tracking}}},
shorttitle = {{{PoseTrack}}},
abstract = {Human poses and motions are important cues for analysis of videos with people and there is strong evidence that representations based on body pose are highly effective for a variety of tasks such as activity recognition, content retrieval and social signal processing. In this work, we aim to further advance the state of the art by establishing "PoseTrack", a new large-scale benchmark for video-based human pose estimation and articulated tracking, and bringing together the community of researchers working on visual human analysis. The benchmark encompasses three competition tracks focusing on i) single-frame multi-person pose estimation, ii) multi-person pose estimation in videos, and iii) multi-person articulated tracking. To facilitate the benchmark and challenge we collect, annotate and release a new \%large-scale benchmark dataset that features videos with multiple people labeled with person tracks and articulated pose. A centralized evaluation server is provided to allow participants to evaluate on a held-out test set. We envision that the proposed benchmark will stimulate productive research both by providing a large and representative training dataset as well as providing a platform to objectively evaluate and compare the proposed methods. The benchmark is freely accessible at https://posetrack.net.},
booktitle = cvpr,
author = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
year = {2018},
}
%B
@article{babaeizadehStochasticVariationalVideo2017a,
title = {Stochastic {{Variational Video Prediction}}},
abstract = {Predicting the future in real-world settings, particularly from raw
sensory observations such as images, is exceptionally challenging.
Real-world events can be stochastic and unpredictable, and the high
dimensionality and complexity of natural images requires the predictive
model to build an intricate understanding of the natural world. Many
existing methods tackle this problem by making simplifying assumptions
about the environment. One common assumption is that the outcome is
deterministic and there is only one plausible future. This can lead to
low-quality predictions in real-world settings with stochastic dynamics.
In this paper, we develop a stochastic variational video prediction (SV2P)
method that predicts a different possible future for each sample of its
latent variables. To the best of our knowledge, our model is the first to
provide effective stochastic multi-frame prediction for real-world video.
We demonstrate the capability of the proposed method in predicting
detailed future frames of videos on multiple real-world datasets, both
action-free and action-conditioned. We find that our proposed method
produces substantially improved video predictions when compared to the
same model without stochasticity, and to other stochastic video prediction
methods. Our SV2P implementation will be open sourced upon publication.},
author = {Babaeizadeh, Mohammad and Finn, Chelsea and Erhan, Dumitru and Campbell, Roy H and Levine, Sergey},
journal={arXiv preprint arXiv:1710.11252},
year = {2017},
}
@inproceedings{baigLearningInpaintImage2017,
title = {Learning to {{Inpaint}} for {{Image Compression}}},
booktitle = neurips,
author = {Baig, Mohammad Haris and Koltun, Vladlen and Torresani, Lorenzo},
year = {2017},
pages = {1246-1255}
}
@book{bishopPatternRecognitionMachine2006,
edition = {1st ed. 20},
title = {Pattern {{Recognition}} and {{Machine Learning}}},
isbn = {978-0-387-31073-2},
abstract = {The field of pattern recognition has undergone substantial development over the years. This book reflects these developments while providing a grounding in the basic concepts of pattern recognition and machine learning. It is aimed at advanced undergraduates or first year PhD students, as well as researchers and practitioners.},
publisher = {{Springer}},
author = {Bishop, Christopher M},
month = oct,
year = {2006},
}
%C
@inproceedings{carreira2017quo,
title={{Quo vadis, action recognition? A new model and the kinetics dataset}},
author={Carreira, Joao and Zisserman, Andrew},
booktitle=cvpr,
year={2017}
}
@article{chenVariationalLossyAutoencoder2016,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1611.02731},
title = {Variational {{Lossy Autoencoder}}},
abstract = {Representation learning seeks to expose certain aspects of observed data in a learned representation that's amenable to downstream tasks like classification. For instance, a good representation for 2D images might be one that describes only global structure and discards information about detailed texture. In this paper, we present a simple but principled method to learn such global representations by combining Variational Autoencoder (VAE) with neural autoregressive models such as RNN, MADE and PixelRNN/CNN. Our proposed VAE model allows us to have control over what the global latent code can learn and by designing the architecture accordingly, we can force the global latent code to discard irrelevant information such as texture in 2D images, and hence the VAE only ``autoencodes'' data in a lossy fashion. In addition, by leveraging autoregressive models as both prior distribution p(z) and decoding distribution p(x|z), we can greatly improve generative modeling performance of VAEs, achieving new state-of-the-art results on MNIST, OMNIGLOT and Caltech-101 Silhouettes density estimation tasks as well as competitive results on CIFAR10.},
language = {en},
journal = {arXiv:1611.02731},
author = {Chen, Xi and Kingma, Diederik P. and Salimans, Tim and Duan, Yan and Dhariwal, Prafulla and Schulman, John and Sutskever, Ilya and Abbeel, Pieter},
year = {2016},
}
@article{chenLearningVideoCompression2018,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1804.09869},
primaryClass = {cs, eess},
title = {Learning for {{Video Compression}}},
abstract = {One key challenge to learning-based video compression is that motion predictive coding, a very effective tool for video compression, can hardly be trained into a neural network. In this paper we propose the concept of VoxelCNN which includes motion extension and hybrid prediction networks. VoxelCNN can model spatiotemporal coherence to effectively perform predictive coding inside the learning network. On the basis of VoxelCNN, we further explore a learning based framework for video compression with additional components of iterative analysis/synthesis, binarization, etc. Experiment results demonstrate the effectiveness of the proposed scheme. Although entropy coding and complex configurations are not employed in this paper, we still demonstrate superior performance compared with MPEG-2 and achieve comparable results with H.264 codec. The proposed learning based scheme provides a possible new direction to further improve compression efficiency and functionalities of future video coding.},
language = {English},
journal = {IEEE Transactions on Circuits and Systems for Video Technology},
author = {Chen, Zhibo and He, Tianyu and Jin, Xin and Wu, Feng},
month = apr,
year = {2019},
}
@techreport{ciscoZettabyteEraTrends2017,
title = {The {{Zettabyte Era}}: {{Trends}} and {{Analysis}}},
author = {Cisco},
year = {2017},
}
@book{coverElementsInformationTheory2006,
address = {New York, NY, USA},
title = {{Elements of Information Theory}},
isbn = {978-0-471-24195-9},
publisher = {{Wiley-Interscience}},
author = {Cover, Thomas M. and Thomas, Joy A.},
year = {2006},
}
%D
%E
%F
%G
@article{giraldoRateDistortionAutoEncoders2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1312.7381},
primaryClass = {cs},
title = {Rate-{{Distortion Auto}}-{{Encoders}}},
abstract = {A rekindled the interest in auto-encoder algorithms has been spurred by recent work on deep learning. Current efforts have been directed towards effective training of auto-encoder architectures with a large number of coding units. Here, we propose a learning algorithm for auto-encoders based on a rate-distortion objective that minimizes the mutual information between the inputs and the outputs of the auto-encoder subject to a fidelity constraint. The goal is to learn a representation that is minimally committed to the input data, but that is rich enough to reconstruct the inputs up to certain level of distortion. Minimizing the mutual information acts as a regularization term whereas the fidelity constraint can be understood as a risk functional in the conventional statistical learning setting. The proposed algorithm uses a recently introduced measure of entropy based on infinitely divisible matrices that avoids the plug in estimation of densities. Experiments using over-complete bases show that the rate-distortion auto-encoders can learn a regularized input-output mapping in an implicit manner.},
language = {en},
journal = {arXiv:1312.7381 [cs]},
author = {Giraldo, Luis G. Sanchez and Principe, Jose C.},
month = dec,
year = {2013},
}
@article{gregorLearningRepresentationsMaximizing2011,
title = {Learning {{Representations}} by {{Maximizing Compression}}},
abstract = {We give an algorithm that learns a representation of data through compression. The algorithm 1) predicts bits sequentially from those previously seen and 2) has a structure and a number of computations similar to an autoencoder. The likelihood under the model can be calculated exactly, and arithmetic coding can be used directly for compression. When training on digits the algorithm learns filters similar to those of restricted boltzman machines and denoising autoencoders. Independent samples can be drawn from the model by a single sweep through the pixels. The algorithm has a good compression performance when compared to other methods that work under random ordering of pixels.},
author = {Gregor, Karol and LeCun, Yann},
month = aug,
journal={arXiv preprint arXiv:1108.1169},
year = {2011},
}
@article{gregorDeepAutoRegressiveNetworks2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1310.8499},
title = {Deep {{AutoRegressive Networks}}},
abstract = {We introduce a deep, generative autoencoder capable of learning hierarchies of distributed representations from data. Successive deep stochastic hidden layers are equipped with autoregressive connections, which enable the model to be sampled from quickly and exactly via ancestral sampling. We derive an efficient approximate parameter estimation method based on the minimum description length (MDL) principle, which can be seen as maximising a variational lower bound on the log-likelihood, with a feedforward neural network implementing approximate inference. We demonstrate state-of-the-art generative performance on a number of classic data sets: several UCI data sets, MNIST and Atari 2600 games.},
language = {en},
journal = {arXiv:1310.8499},
author = {Gregor, Karol and Danihelka, Ivo and Mnih, Andriy and Blundell, Charles and Wierstra, Daan},
month = oct,
year = {2013},
}
@article{heMaskRCNN2017,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1703.06870},
primaryClass = {cs},
title = {Mask {{R}}-{{CNN}}},
abstract = {We present a conceptually simple, flexible, and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without bells and whistles, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. Code has been made available at: https://github.com/facebookresearch/Detectron},
journal = {arXiv:1703.06870},
author = {He, Kaiming and Gkioxari, Georgia and Doll\'ar, Piotr and Girshick, Ross},
month = mar,
year = {2017},
}
@inproceedings{AutoencodersMinimumDescription,
title = {Autoencoders, {{Minimum Description Length}} and {{Helmholtz Free Energy}}},
author={Hinton, Geoffrey E and Zemel, Richard S},
booktitle=neurips,
pages={3--10},
year={1994}
}
@inproceedings{hintonKeepingNeuralNetworks,
title = {Keeping {{Neural Networks Simple}} by {{Minimizing}} the {{Description Length}} of the {{Weights}}},
author = {Hinton, Geoffrey E and {van Camp}, Drew},
booktitle={ACM Conf. on Computational Learning Theory},
year={1993},
}
@article{honkelaVariationalLearningBitsBack2004,
title = {Variational {{Learning}} and {{Bits}}-{{Back Coding}}: {{An Information}}-{{Theoretic View}} to {{Bayesian Learning}}},
volume = {15},
issn = {1045-9227},
shorttitle = {Variational {{Learning}} and {{Bits}}-{{Back Coding}}},
doi = {10.1109/TNN.2004.828762},
abstract = {The bits-back coding first introduced by Wallace in 1990 and later by Hinton and van Camp in 1993 provides an interesting link between Bayesian learning and information-theoretic minimum-description-length (MDL) learning approaches. The bits-back coding allows interpreting the cost function used in the variational Bayesian method called ensemble learning as a code length in addition to the Bayesian view of misfit of the posterior approximation and a lower bound of model evidence. Combining these two viewpoints provides interesting insights to the learning process and the functions of different parts of the model. In this paper, the problem of variational Bayesian learning of hierarchical latent variable models is used to demonstrate the benefits of the two views. The code-length interpretation provides new views to many parts of the problem such as model comparison and pruning and helps explain many phenomena occurring in learning.},
language = {en},
number = {4},
journal = {IEEE Transactions on Neural Networks},
author = {Honkela, A. and Valpola, H.},
month = jul,
year = {2004},
pages = {800-810},
}
%I
@article{ioffeBatchNormalizationAccelerating2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1502.03167},
primaryClass = {cs},
title = {Batch {{Normalization}}: {{Accelerating Deep Network Training}} by {{Reducing Internal Covariate Shift}}},
shorttitle = {Batch {{Normalization}}},
abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batchnormalized networks, we improve upon the best published result on ImageNet classification: reaching 4.9\% top-5 validation error (and 4.8\% test error), exceeding the accuracy of human raters.},
language = {en},
journal = {arXiv:1502.03167},
author = {Ioffe, Sergey and Szegedy, Christian},
month = feb,
year = {2015},
}
%J
@inproceedings{johnstonImprovedLossyImage2017,
title = {Improved {{Lossy Image Compression}} with {{Priming}} and {{Spatially Adaptive Bit Rates}} for {{Recurrent Networks}}},
abstract = {We propose a method for lossy image compression based on recurrent, convolutional neural networks that outperforms BPG (4:2:0 ), WebP, JPEG2000, and JPEG as measured by MS-SSIM. We introduce three improvements over previous research that lead to this state-of-the-art result. First, we show that training with a pixel-wise loss weighted by SSIM increases reconstruction quality according to several metrics. Second, we modify the recurrent architecture to improve spatial diffusion, which allows the network to more effectively capture and propagate image information through the network's hidden state. Finally, in addition to lossless entropy coding, we use a spatially adaptive bit allocation algorithm to more efficiently use the limited number of bits to encode visually complex image regions. We evaluate our method on the Kodak and Tecnick image sets and compare against standard codecs as well recently published methods based on deep neural networks.},
author = {Johnston, Nick and Vincent, Damien and Minnen, David and Covell, Michele and Singh, Saurabh and Chinen, Troy and Hwang, Sung Jin and Shor, Joel and Toderici, George},
booktitle = cvpr,
year = {2017},
}
%K
@inproceedings{kalchbrennerVideoPixelNetworks2016,
title = {Video {{Pixel Networks}}},
abstract = {We propose a probabilistic video model, the Video Pixel Network (VPN),
that estimates the discrete joint distribution of the raw pixel values in
a video. The model and the neural architecture reflect the time, space and
color structure of video tensors and encode it as a four-dimensional
dependency chain. The VPN approaches the best possible performance on the
Moving MNIST benchmark, a leap over the previous state of the art, and the
generated videos show only minor deviations from the ground truth. The VPN
also produces detailed samples on the action-conditional Robotic Pushing
benchmark and generalizes to the motion of novel objects.},
author = {Kalchbrenner, Nal and {van den Oord}, Aaron and Simonyan, Karen and Danihelka, Ivo and Vinyals, Oriol and Graves, Alex and Kavukcuoglu, Koray},
booktitle = icml,
pages={1771--1779},
year={2017},
}
@inproceedings{kingmaAdamMethodStochastic2015,
title = {Adam: {{A Method}} for {{Stochastic Optimization}}},
booktitle = iclr,
author = {Kingma, Diederik P and Ba, Jimmy},
year = {2015},
}
@article{kingmaAutoEncodingVariationalBayes2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1312.6114},
primaryClass = {cs, stat},
title = {Auto-{{Encoding Variational Bayes}}},
abstract = {How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions is two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results.},
journal = {arXiv:1312.6114},
author = {Kingma, Diederik P. and Welling, Max},
month = dec,
year = {2013},
}
%L
@inproceedings{laudeDeepLearningbasedIntra2016a,
title = {Deep Learning-Based Intra Prediction Mode Decision for {{HEVC}}},
doi = {10.1109/PCS.2016.7906399},
abstract = {The High Efficiency Video Coding standard and its screen content coding
extension provide superior coding efficiency compared to predecessor
standards. However, this coding efficiency is achieved at the expense of
very complex encoders. One major complexity driver is the comprehensive
rate distortion (RD) optimization. In this paper, we present a deep
learning-based encoder control which replaces the conventional RD
optimization for the intra prediction mode with deep convolutional neural
network (CNN) classifiers. Thereby, we save the RD optimization
complexity. Our classifiers operate independently of any encoder decisions
and reconstructed sample values. Thus, no additional systematic latency is
introduced. Furthermore, the loss in coding efficiency is negligible with
an average value of 0.52\% over HM-16.6+SCM-5.2.},
booktitle = {Picture Coding Symposium},
author = {Laude, T and Ostermann, J},
month = dec,
year = {2016},
pages = {1-5}
}
@unpublished{CONTEXTADAPTIVEENTROPYMODEL,
title = {Context-Adaptive {{Entropy Model}} for {{End}}-to-End {{Optimized Image Compression}}},
author = {Lee, Jooyoung and Cho, Seunghyun and Beack, Seung-Kwon},
year = {2018},
}
@inproceedings{li2018learning,
title = {Learning Convolutional Networks for Content-Weighted Image Compression},
booktitle = cvpr,
author = {Li, Mu and Zuo, Wangmeng and Gu, Shuhang and Zhao, Debin and Zhang, David},
year = {2018},
}
@article{liDisentangledSequentialAutoencoder2018a,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1803.02991},
primaryClass = {cs},
title = {Disentangled {{Sequential Autoencoder}}},
abstract = {We present a VAE architecture for encoding and generating high dimensional sequential data, such as video or audio. Our deep generative model learns a latent representation of the data which is split into a static and dynamic part, allowing us to approximately disentangle latent time-dependent features (dynamics) from features which are preserved over time (content). This architecture gives us partial control over generating content and dynamics by conditioning on either one of these sets of features. In our experiments on artificially generated cartoon video clips and voice recordings, we show that we can convert the content of a given sequence into another one by such content swapping. For audio, this allows us to convert a male speaker into a female speaker and vice versa, while for video we can separately manipulate shapes and dynamics. Furthermore, we give empirical evidence for the hypothesis that stochastic RNNs as latent state models are more efficient at compressing and generating long sequences than deterministic ones, which may be relevant for applications in video compression.},
journal = {arXiv:1803.02991 [cs]},
author = {Li, Yingzhen and Mandt, Stephan},
month = mar,
year = {2018},
}
%M
@article{mehriSampleRNNUnconditionalEndtoEnd2016,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1612.07837},
primaryClass = {cs},
title = {{{SampleRNN}}: {{An Unconditional End}}-to-{{End Neural Audio Generation Model}}},
shorttitle = {{{SampleRNN}}},
abstract = {In this paper we propose a novel model for unconditional audio generation based on generating one audio sample at a time. We show that our model, which profits from combining memory-less modules, namely autoregressive multilayer perceptrons, and stateful recurrent neural networks in a hierarchical structure is able to capture underlying sources of variations in the temporal sequences over very long time spans, on three datasets of different nature. Human evaluation on the generated samples indicate that our model is preferred over competing models. We also show how each component of the model contributes to the exhibited performance.},
language = {en},
journal = {arXiv:1612.07837},
author = {Mehri, Soroush and Kumar, Kundan and Gulrajani, Ishaan and Kumar, Rithesh and Jain, Shubham and Sotelo, Jose and Courville, Aaron and Bengio, Yoshua},
month = dec,
year = {2016},
}
%N
%O
@inproceedings{ofliBerkeleyMHADComprehensive2013,
address = {Clearwater Beach, FL, USA},
title = {Berkeley {{MHAD}}: {{A}} Comprehensive {{Multimodal Human Action Database}}},
isbn = {978-1-4673-5054-9 978-1-4673-5053-2 978-1-4673-5052-5},
shorttitle = {Berkeley {{MHAD}}},
doi = {10.1109/WACV.2013.6474999},
abstract = {Over the years, a large number of methods have been proposed to analyze human pose and motion information from images, videos, and recently from depth data. Most methods, however, have been evaluated on datasets that were too specific to each application, limited to a particular modality, and more importantly, captured under unknown conditions. To address these issues, we introduce the Berkeley Multimodal Human Action Database (MHAD) consisting of temporally synchronized and geometrically calibrated data from an optical motion capture system, multibaseline stereo cameras from multiple views, depth sensors, accelerometers and microphones. This controlled multimodal dataset provides researchers an inclusive testbed to develop and benchmark new algorithms across multiple modalities under known capture conditions in various research domains. To demonstrate possible use of MHAD for action recognition, we compare results using the popular Bag-of-Words algorithm adapted to each modality independently with the results of various combinations of modalities using the Multiple Kernel Learning. Our comparative results show that multimodal analysis of human motion yields better action recognition rates than unimodal analysis.},
language = {en},
booktitle = {{{IEEE Workshop}} on {{Applications}} of {{Computer Vision}}},
author = {Ofli, Ferda and Chaudhry, Rizwan and Kurillo, Gregorij and Vidal, Rene and Bajcsy, Ruzena},
month = jan,
year = {2013},
pages = {53-60},
}
%P
@unpublished{pessoaEndtoEndLearningVideo2018,
title = {End-to-{{End Learning}} of {{Video Compression}} Using {{Spatio}}-{{Temporal Autoencoders}}},
author = {Pessoa, Jorge and Aidos, Helena and Tom\'as, Pedro and Figueiredo, M\'ario AT},
year = {2018},
}
@inproceedings{prakashSemanticPerceptualImage2017a,
title = {Semantic {{Perceptual Image Compression Using Deep Convolution Networks}}},
doi = {10.1109/DCC.2017.56},
abstract = {It has long been considered a significant problem to improve the visual
quality of lossy image and video compression. Recent advances in computing
power together with the availability of large training data sets has
increased interest in the application of deep learning CNNs to address
image recognition and image processing tasks. Here, we present a powerful
CNN tailored to the specific task of semantic image understanding to
achieve higher visual quality in lossy compression. A modest increase in
complexity is incorporated to the encoder which allows a standard,
off-the-shelf JPEG decoder to be used. While JPEG encoding may be
optimized for generic images, the process is ultimately unaware of the
specific content of the image to be compressed. Our technique makes JPEG
content-aware by designing and training a model to identify multiple
semantic regions in a given image. Unlike object detection techniques, our
model does not require labeling of object positions and is able to
identify objects in a single pass. We present a new CNN architecture
directed specifically to image compression, which generates a map that
highlights semantically-salient regions so that they can be encoded at
higher quality as compared to background regions. By adding a complete set
of features for every class, and then taking a threshold over the sum of
all feature activations, we generate a map that highlights
semantically-salient regions so that they can be encoded at a better
quality compared to background regions. Experiments are presented on the
Kodak PhotoCD dataset and the MIT Saliency Benchmark dataset, in which our
algorithm achieves higher visual quality for the same compressed size
while preserving PSNR.},
booktitle = {Data Compression Conference},
author = {Prakash, A and Moran, N and Garber, S and Dilillo, A and Storer, J},
month = apr,
year = {2017},
pages = {250-259}
}
%R
@article{rezendeStochasticBackpropagationApproximate2014,
title = {Stochastic {{Backpropagation}} and {{Approximate Inference}} in {{Deep Generative Models}}},
language = {en},
author = {Rezende, Danilo Jimenez and Mohamed, Shakir and Wierstra, Daan},
year = {2014},
journal={arXiv preprint arXiv:1401.4082},
}
@inproceedings{rippelRealTimeAdaptiveImage2017a,
title = {Real-{{Time Adaptive Image Compression}}},
abstract = {We present a machine learning-based approach to lossy image compression
which outperforms all existing codecs, while running in real-time. Our
algorithm typically produces file sizes 3 times smaller than JPEG, 2.5
times smaller than JPEG 2000, and 2.3 times smaller than WebP on datasets
of generic images across a spectrum of quality levels. At the same time,
our codec is designed to be lightweight and deployable: for example, it
can encode or decode the Kodak dataset in less than 10ms per image on GPU.
Our architecture is an autoencoder featuring pyramidal analysis, an
adaptive coding module, and regularization of the expected codelength. We
also supplement our approach with adversarial training specialized towards
use in a compression setting: this enables us to produce visually pleasing
reconstructions for very low bitrates.},
author = {Rippel, Oren and Bourdev, Lubomir},
year = {2017},
booktitle = icml,
pages={2922--2930},
}
%S
@inproceedings{salimansPIXELCNNIMPROVINGPIXELCNN2017,
title = {{{PixelCNN}}++: {{Improving}} the {{PixelCNN}} with {{Discretized Logistic Mixture Likelihood}} and {{Other Modifications}}},
abstract = {PixelCNNs are a recently proposed class of powerful generative models with tractable likelihood. Here we discuss our implementation of PixelCNNs which we make available at https://github.com/openai/pixel-cnn. Our implementation contains a number of modifications to the original model that both simplify its structure and improve its performance. 1) We use a discretized logistic mixture likelihood on the pixels, rather than a 256-way softmax, which we find to speed up training. 2) We condition on whole pixels, rather than R/G/B sub-pixels, simplifying the model structure. 3) We use downsampling to efficiently capture structure at multiple resolutions. 4) We introduce additional short-cut connections to further speed up optimization. 5) We regularize the model using dropout. Finally, we present state-of-the-art log likelihood results on CIFAR-10 to demonstrate the usefulness of these modifications.},
language = {English},
author = {Salimans, Tim and Karpathy, Andrej and Chen, Xi and Kingma, Diederik P},
year = {2017},
booktitle = iclr,
}
@inproceedings{santurkar2018generative,
title = {Generative Compression},
booktitle = {Picture Coding Symposium},
author = {Santurkar, Shibani and Budden, David and Shavit, Nir},
year = {2018},
pages = {258-262}
}
@unpublished{snellLearningGenerateImages2015,
title = {Learning to {{Generate Images}} with {{Perceptual Similarity Metrics}}},
abstract = {Deep networks are increasingly being applied to problems involving image synthesis, e.g., generating images from textual descriptions and reconstructing an input image from a compact representation. Supervised training of image-synthesis networks typically uses a pixel-wise loss (PL) to indicate the mismatch between a generated image and its corresponding target image. We propose instead to use a loss function that is better calibrated to human perceptual judgments of image quality: the multiscale structural-similarity score (MS-SSIM). Because MS-SSIM is differentiable, it is easily incorporated into gradient-descent learning. We compare the consequences of using MS-SSIM versus PL loss on training deterministic and stochastic autoencoders. For three different architectures, we collected human judgments of the quality of image reconstructions. Observers reliably prefer images synthesized by MS-SSIM-optimized models over those synthesized by PL-optimized models, for two distinct PL measures (\$\textbackslash{}ell\_1\$ and \$\textbackslash{}ell\_2\$ distances). We also explore the effect of training objective on image encoding and analyze conditions under which perceptually-optimized representations yield better performance on image classification. Finally, we demonstrate the superiority of perceptually-optimized networks for super-resolution imaging. Just as computer vision has advanced through the use of convolutional architectures that mimic the structure of the mammalian visual system, we argue that significant additional advances can be made in modeling images through the use of training objectives that are well aligned to characteristics of human perception.},
author = {Snell, Jake and Ridgeway, Karl and Liao, Renjie and Roads, Brett D and Mozer, Michael C and Zemel, Richard S},
month = nov,
year = {2015},
}
@article{sullivanOverviewHighEfficiency2012,
title = {{Overview of the High Efficiency Video Coding (HEVC) Standard}},
volume = {22},
issn = {1051-8215},
number = {12},
journal = {IEEE Trans. Circuits Syst. Video Technol.},
author = {Sullivan, G J and Ohm, J R and Han, W J and Wiegand, T},
month = dec,
year = {2012},
pages = {1649-1668}
}
%T
@article{todericiVariableRateImage2015a,
title = {Variable {{Rate Image Compression}} with {{Recurrent Neural Networks}}},
author = {Toderici, George and O'Malley, Sean M and Hwang, Sung Jin and Vincent, Damien and Minnen, David and Baluja, Shumeet and Covell, Michele and Sukthankar, Rahul},
year = {2016},
booktitle = iclr,
}
@inproceedings{todericiFullResolutionImage2017,
title = {Full {{Resolution Image Compression With Recurrent Neural Networks}}},
booktitle = cvpr,
author = {Toderici, George and Vincent, Damien and Johnston, Nick and Hwang, Sung Jin and Minnen, David and Shor, Joel and Covell, Michele},
year = {2017},
}
@inproceedings{tsaiLearningBinaryResidual2017a,
title = {Learning {{Binary Residual Representations}} for {{Domain}}-Specific {{Video Streaming}}},
abstract = {We study domain-specific video streaming. Specifically, we target a
streaming setting where the videos to be streamed from a server to a
client are all in the same domain and they have to be compressed to a
small size for low-latency transmission. Several popular video streaming
services, such as the video game streaming services of GeForce Now and
Twitch, fall in this category. While conventional video compression
standards such as H.264 are commonly used for this task, we hypothesize
that one can leverage the property that the videos are all in the same
domain to achieve better video quality. Based on this hypothesis, we
propose a novel video compression pipeline. Specifically, we first apply
H.264 to compress domain-specific videos. We then train a novel binary
autoencoder to encode the leftover domain-specific residual information
frame-by-frame into binary representations. These binary representations
are then compressed and sent to the client together with the H.264 stream.
In our experiments, we show that our pipeline yields consistent gains over
standard H.264 compression across several benchmark datasets while using
the same channel bandwidth.},
author = {Tsai, Yi-Hsuan and Liu, Ming-Yu and Sun, Deqing and Yang, Ming-Hsuan and Kautz, Jan},
year = {2018},
booktitle = {AAAI},
keywords = {Machine Learning \& Statistics/Generative/Data Compression}
}
%U
%V
%W
@article{wainwrightGraphicalModelsExponential2007a,
title = {Graphical {{Models}}, {{Exponential Families}}, and {{Variational Inference}}},
volume = {1},
issn = {1935-8237, 1935-8245},
doi = {10.1561/2200000001},
language = {English},
number = {1--2},
journal = {Foundations and Trends\textregistered{} in Machine Learning},
author = {Wainwright, Martin J. and Jordan, Michael I.},
year = {2007},
keywords = {Machine Learning \& Statistics,Paper bibliographies/ICML2015,Paper bibliographies/PhD Thesis},
pages = {1-305},
}
@inproceedings{wangLieGroupTransformation2011a,
title = {Lie {{Group Transformation Models}} for {{Predictive Video Coding}}},
booktitle = {Data Compression Conference},
author = {Wang, C M and {Sohl-Dickstein}, J and Tosic, Ivana and Olshausen, Bruno A},
year = {2011},
keywords = {Machine Learning \& Statistics,Machine Learning \& Statistics/Group Theoretical Learning,Paper bibliographies/MSc Thesis,Paper bibliographies/ICLR2015},
pages = {83-92}
}
@article{wangMultiScaleStructuralSimilarity2003,
title={{Image quality assessment: from error visibility to structural similarity}},
author={Wang, Zhou and Bovik, Alan C and Sheikh, Hamid R and Simoncelli, Eero P},
journal={IEEE Trans. on Image Processing},
volume={13},
number={4},
pages={600--612},
year={2004}
}
%Z
%DATASETS
@misc{UVG,
title = {{Ultra Video Group} test sequences},
howpublished = {\url{http://ultravideo.cs.tut.fi/}},
note = {Accessed: 2020-02-21}
}
@misc{Xiph,
title = {Xiph.org Video Test Media [derf's collection]},
howpublished = {\url{https://media.xiph.org/video/derf/}},
note = {Accessed: 2020-02-21}
}
@misc{VTL,
title = {Video trace library},
howpublished = {\url{http://trace.eas.asu.edu/index.html}},
note = {Accessed: 2019-03-18}
}
@inproceedings{posetrack,
title={Detect-and-track: Efficient pose estimation in videos},
author={Girdhar, Rohit and Gkioxari, Georgia and Torresani, Lorenzo and Paluri, Manohar and Tran, Du},
booktitle={CVPR},
year={2018}
}
%OTHERS, UNUSED(?)
@unpublished{ADAPTIVESAMPLESPACEADAPTIVEa,
title = {Adaptive Sample-Space \& Adaptive Probability Coding: A Neural-Network Based Approach for Compression},
}
@unpublished{CONTEXTADAPTIVEENTROPYMODEL,
title = {Context-Adaptive Entropy Model for End-to-End Optimized Image Compression},
}
@unpublished{GENERATIVEADVERSARIALNETWORKS,
title = {Generative {{Adversarial Networks}} for {{Extreme Learned Image Compression}}},
}
@unpublished{PRACTICALLOSSLESSCOMPRESSIONa,
title = {Practical Lossless Compression with Latent Variables Using Bits Back Coding},
}
@unpublished{bastiaankleijnWavenetBasedLow2017a,
title = {Wavenet Based Low Rate Speech Coding},
abstract = {Traditional parametric coding of speech facilitates low rate but provides
poor reconstruction quality because of the inadequacy of the model used.
We describe how a WaveNet generative speech model can be used to generate
high quality speech from the bit stream of a standard parametric coder
operating at 2.4 kb/s. We compare this parametric coder with a waveform
coder based on the same generative model and show that approximating the
signal waveform incurs a large rate penalty. Our experiments confirm the
high performance of the WaveNet based coder and show that the speech
produced by the system is able to additionally perform implicit bandwidth
extension and does not significantly impair recognition of the original
speaker for the human listener, even when that speaker has not been used
during the training of the generative model.},
author = {Kleijn, W Bastiaan and Lim, Felicia S C and Luebs, Alejandro and Skoglund, Jan and Stimberg, Florian and Wang, Quan and Walters, Thomas C},
month = dec,
year = {2017},
keywords = {Machine Learning \& Statistics/Generative/Data Compression}
}