% dirty.bib
@inproceedings{wuVideoCompressionImage2018,
title = {{Video Compression through Image Interpolation}},
author = {Wu, Chao-Yuan and Singhal, Nayan and Kr\"ahenb\"uhl, Philipp},
year = {2018},
booktitle=eccv,
}
@InProceedings{Habibian_2019_ICCV,
author = {Habibian, Amirhossein and {van Rozendaal}, Ties and Tomczak, Jakub M. and Cohen, Taco S.},
title = {{Video Compression With Rate-Distortion Autoencoders}},
booktitle = iccv,
year = {2019}
}
@InProceedings{rippelLearnedVideoCompression2018,
title = {{Learned Video Compression}},
booktitle = iccv,
author = {Rippel, Oren and Nair, Sanjay and Lew, Carissa and Branson, Steve and Anderson, Alexander G. and Bourdev, Lubomir},
year = {2019},
}
@inproceedings{stephan_2019_neurips,
author = {Jun Han and Salvator Lombardo and Christopher Schroers and Stephan Mandt},
title = {{Deep Probabilistic Video Compression}},
booktitle = neurips,
year = {2019},
}
@inproceedings{luDVCEndtoendDeep2018,
title = {{DVC: An End-to-End Deep Video Compression Framework}},
author = {Lu, Guo and Ouyang, Wanli and Xu, Dong and Zhang, Xiaoyun and Cai, Chunlei and Gao, Zhiyong},
booktitle = cvpr,
year = {2019},
}
@inproceedings{mentzerConditionalProbabilityModels2018,
title = {{Conditional Probability Models for Deep Image Compression}},
booktitle = cvpr,
author = {Mentzer, Fabian and Agustsson, Eirikur and Tschannen, Michael and Timofte, Radu and Van Gool, Luc},
year = {2018},
}
@inproceedings{liu2020learned,
title={{Learned Video Compression via Joint Spatial-Temporal Correlation Exploration}},
author={Haojie Liu and Han Shen and Lichao Huang and Ming Lu and Tong Chen and Zhan Ma},
year={2020},
booktitle = aaai,
}
@inproceedings{Djelouah_2019_ICCV,
author = {Djelouah, Abdelaziz and Campos, Joaquim and Schaub-Meyer, Simone and Schroers, Christopher},
title = {{Neural Inter-Frame Compression for Video Coding}},
booktitle = iccv,
year = {2019}
}
@inproceedings{theisLossyImageCompression2017,
title = {{Lossy Image Compression with Compressive Autoencoders}},
author = {Theis, Lucas and Shi, Wenzhe and Cunningham, Andrew and Husz\'ar, Ferenc},
year = {2017},
booktitle=iclr,
}
@inproceedings{jaderberg2015spatial,
title={{Spatial Transformer Networks}},
author={Max Jaderberg and Karen Simonyan and Andrew Zisserman and Koray Kavukcuoglu},
year={2015},
booktitle = neurips,
}
@inproceedings{UNet,
author = {Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas},
booktitle = miccai,
title = {{U-Net: Convolutional Networks for Biomedical Image Segmentation}},
year = {2015},
}
@inproceedings{FRAE,
title={{Feedback Recurrent AutoEncoder}},
author={Yang Yang and Guillaume Sauti{\'e}re and J. Jon Ryu and Taco S Cohen},
year={2019},
booktitle = {ICASSP},
}
@inproceedings{vandenOord2016pixelCNN,
title = {{Conditional Image Generation with PixelCNN Decoders}},
author = {{van den Oord}, Aaron and Kalchbrenner, Nal and Espeholt, Lasse and Kavukcuoglu, Koray and Vinyals, Oriol and Graves, Alex},
booktitle = neurips,
year = {2016},
}
@article{hu2020learning,
title={Learning End-to-End Lossy Image Compression: A Benchmark},
author={Yueyu Hu and Wenhan Yang and Zhan Ma and Jiaying Liu},
year={2020},
journal = {arXiv:2002.03711},
}
@inproceedings{balle2016end,
title = {{End-to-End Optimized Image Compression}},
author = {Ball\'e, Johannes and Laparra, Valero and Simoncelli, Eero P},
year = {2017},
booktitle = iclr,
}
@inproceedings{balleVARIATIONALIMAGECOMPRESSION2018,
title = {{Variational Image Compression with a Scale Hyperprior}},
author = {Ball\'e, Johannes and Minnen, David and Singh, Saurabh and Hwang, Sung Jin and Johnston, Nick},
year = {2018},
booktitle = iclr,
}
@inproceedings{rippel2017real,
title={{Real-time adaptive image compression}},
author={Rippel, Oren and Bourdev, Lubomir},
booktitle=icml,
year={2017},
}
@inproceedings{betavae2017iclr,
title={{$\beta$-VAE: Learning Basic Visual Concepts with a Constrained Variational Framework}},
author={Irina Higgins and Loic Matthey and Arka Pal and Christopher Burgess and Xavier Glorot and Matthew Botvinick and Shakir Mohamed and Alexander Lerchner},
booktitle=iclr,
year={2017},
}
@inproceedings{Gregor2016,
title={{Towards Conceptual Compression}},
author={Karol Gregor and Frederic Besse and Danilo Jimenez Rezende and Ivo Danihelka and Daan Wierstra},
booktitle = neurips,
year = {2016},
}
@inproceedings{Gregor2015,
title={{DRAW: A Recurrent Neural Network For Image Generation}},
author={Karol Gregor and Ivo Danihelka and Alex Graves and Danilo Jimenez Rezende and Daan Wierstra},
booktitle = icml,
year={2015},
}
@article{l1_plus_msssim,
author={H. {Zhao} and O. {Gallo} and I. {Frosio} and J. {Kautz}},
journal={IEEE Transactions on Computational Imaging},
title={{Loss Functions for Image Restoration With Neural Networks}},
year={2017},
}
@inproceedings{convlstm,
author = {Xingjian Shi and
Zhourong Chen and
Hao Wang and
Dit{-}Yan Yeung and
Wai{-}Kin Wong and
Wang{-}chun Woo},
title = {{Convolutional LSTM Network: A Machine Learning Approach for Precipitation
Nowcasting}},
booktitle = neurips,
year = {2015},
}
@inproceedings{minnenJointAutoregressiveAndHierarchicalPriors2018,
title = {Joint Autoregressive and Hierarchical Priors for Learned Image Compression},
author = {Minnen, David and Ball\'{e}, Johannes and Toderici, George D},
booktitle = neurips,
year = 2018,
}
@article{liuNLAICImageCompression2019,
title={{Non-local Attention Optimized Deep Image Compression}},
author={Haojie Liu and Tong Chen and Peiyao Guo and Qiu Shen and Xun Cao and Yao Wang and Zhan Ma},
year={2019},
journal={arXiv:1904.09757},
}
@inproceedings{balleDensity2015,
title={{Density Modeling of Images using a Generalized Normalization Transformation}},
author={Johannes Ball\'e and Valero Laparra and Eero P. Simoncelli},
booktitle = iclr,
year={2016},
}
@book{source_coding,
title = {{Digital Signal Compression: Principles and Practice}},
author = {William A. Pearlman and Amir Said},
year = {2011},
publisher = {Cambridge University Press},
}
@inproceedings{webb2019inversion,
title = {{Faithful Inversion of Generative Models for Effective Amortized Inference}},
author = {Webb, Stefan and Goli{\'n}ski, Adam and Zinkov, Rob and N, Siddharth and Rainforth, Tom and Teh, Yee Whye and Wood, Frank},
booktitle = neurips,
year = {2018},
}
@inproceedings{krishnan2017structured,
author = {Krishnan, Rahul G. and Shalit, Uri and Sontag, David},
title = {Structured Inference Networks for Nonlinear State Space Models},
year = {2017},
booktitle = aaai,
}
@book{koller2009probabilistic,
author = {Koller, D. and Friedman, N.},
isbn = {9780262013192},
lccn = {2009008615},
publisher = {MIT Press},
title = {Probabilistic Graphical Models: Principles and Techniques},
year = 2009
}
@article{Kinetics600,
author = {Jo{\~{a}}o Carreira and
          Eric Noland and
          Andras Banki{-}Horvath and
          Chloe Hillier and
          Andrew Zisserman},
title = {{A Short Note about Kinetics-600}},
journal = {arXiv:1808.01340},
year = {2018},
}
@inproceedings{heDeepResidualLearning2016,
title = {{Deep Residual Learning for Image Recognition}},
booktitle = cvpr,
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
year = {2016},
}
@inproceedings{broken_elbo,
author = {Alexander A. Alemi and
Ben Poole and
Ian Fischer and
Joshua V. Dillon and
Rif A. Saurous and
Kevin Murphy},
title = {{Fixing a Broken ELBO}},
booktitle = icml,
year = {2018},
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@misc{HM,
title = {{High Efficiency Video Coding (HEVC)}},
howpublished = {\url{https://hevc.hhi.fraunhofer.de/}},
note = {Accessed: 2020-02-21}
}
@misc{digital_video_introduction,
title = {Digital Video Introduction},
howpublished = {\url{https://github.com/leandromoreira/digital_video_introduction/blob/master/README.md#frame-types}},
note = {Accessed: 2020-03-02}
}
@misc{ffmpeg,
title = {ffmpeg},
howpublished = {\url{http://ffmpeg.org/}},
note = {Accessed: 2020-02-21}
}
@misc{netflix-data-usage,
title = {How Much Data Does {Netflix} Use?},
howpublished = {\url{https://www.howtogeek.com/338983/how-much-data-does-netflix-use/}},
note = {Accessed: 2020-02-28}
}
@misc{video-data-global-usage,
title = {2019 Global Internet Phenomena Report},
howpublished = {\url{https://www.ncta.com/whats-new/report-where-does-the-majority-of-internet-traffic-come}},
note = {Accessed: 2020-02-28}
}
@article{Kinetics400,
author = {Will Kay and
Jo{\~{a}}o Carreira and
Karen Simonyan and
Brian Zhang and
Chloe Hillier and
Sudheendra Vijayanarasimhan and
Fabio Viola and
Tim Green and
Trevor Back and
Paul Natsev and
Mustafa Suleyman and
Andrew Zisserman},
title = {{The Kinetics Human Action Video Dataset}},
journal = {arXiv:1705.06950},
year = {2017},
}
% Another possible temporal coherency loss
% https://research.nvidia.com/publication/interactive-reconstruction-monte-carlo-image-sequences-using-recurrent-denoising
@InProceedings{FlowNet,
author = {A. Dosovitskiy and P. Fischer and E. Ilg and P. H{\"a}usser and C. Haz{\i}rba{\c{s}} and V. Golkov and P. v.d. Smagt and D. Cremers and T. Brox},
title = {FlowNet: Learning Optical Flow with Convolutional Networks},
booktitle = iccv,
year = {2015},
}
% bibtex files for ICCV 2019 RDAE paper
@inproceedings{agustssonSofttoHardVectorQuantization2017,
title = {Soft-to-{{Hard Vector Quantization}} for {{End}}-to-{{End Learning Compressible Representations}}},
booktitle = neurips,
publisher = {{Curran Associates, Inc.}},
author = {Agustsson, Eirikur and Mentzer, Fabian and Tschannen, Michael and Cavigelli, Lukas and Timofte, Radu and Benini, Luca and Gool, Luc V},
year = {2017},
pages = {1141-1151}
}
@article{agustssonGenerativeAdversarialNetworks2018a,
title = {Generative {{Adversarial Networks}} for {{Extreme Learned Image Compression}}},
abstract = {We propose a framework for extreme learned image compression based on
Generative Adversarial Networks (GANs), obtaining visually pleasing images
at significantly lower bitrates than previous methods. This is made
possible through our GAN formulation of learned compression combined with
a generator/decoder which operates on the full-resolution image and is
trained in combination with a multi-scale discriminator. Additionally, our
method can fully synthesize unimportant regions in the decoded image such
as streets and trees from a semantic label map extracted from the original
image, therefore only requiring the storage of the preserved region and
the semantic label map. A user study confirms that for low bitrates, our
approach significantly outperforms state-of-the-art methods, saving up to
67\% compared to the next-best method BPG.},
author = {Agustsson, Eirikur and Tschannen, Michael and Mentzer, Fabian and Timofte, Radu and Van Gool, Luc},
journal={arXiv preprint arXiv:1804.02958},
year = {2018},
}
@article{alemiFixingBrokenELBO2017,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1711.00464},
primaryClass = {cs, stat},
title = {Fixing a {{Broken ELBO}}},
abstract = {Recent work in unsupervised representation learning has focused on learning deep directed latent-variable models. Fitting these models by maximizing the marginal likelihood or evidence is typically intractable, thus a common approximation is to maximize the evidence lower bound (ELBO) instead. However, maximum likelihood training (whether exact or approximate) does not necessarily result in a good latent representation, as we demonstrate both theoretically and empirically. In particular, we derive variational lower and upper bounds on the mutual information between the input and the latent variable, and use these bounds to derive a rate-distortion curve that characterizes the tradeoff between compression and reconstruction accuracy. Using this framework, we demonstrate that there is a family of models with identical ELBO, but different quantitative and qualitative characteristics. Our framework also suggests a simple new method to ensure that latent variable models with powerful stochastic decoders do not ignore their latent code.},
journal = {arXiv:1711.00464},
author = {Alemi, Alexander A. and Poole, Ben and Fischer, Ian and Dillon, Joshua V. and Saurous, Rif A. and Murphy, Kevin},
month = nov,
year = {2017},
}
@inproceedings{andrilukaPoseTrackBenchmarkHuman2018,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1710.10000},
title = {{{PoseTrack}}: {{A Benchmark}} for {{Human Pose Estimation}} and {{Tracking}}},
shorttitle = {{{PoseTrack}}},
abstract = {Human poses and motions are important cues for analysis of videos with people and there is strong evidence that representations based on body pose are highly effective for a variety of tasks such as activity recognition, content retrieval and social signal processing. In this work, we aim to further advance the state of the art by establishing "PoseTrack", a new large-scale benchmark for video-based human pose estimation and articulated tracking, and bringing together the community of researchers working on visual human analysis. The benchmark encompasses three competition tracks focusing on i) single-frame multi-person pose estimation, ii) multi-person pose estimation in videos, and iii) multi-person articulated tracking. To facilitate the benchmark and challenge we collect, annotate and release a new \%large-scale benchmark dataset that features videos with multiple people labeled with person tracks and articulated pose. A centralized evaluation server is provided to allow participants to evaluate on a held-out test set. We envision that the proposed benchmark will stimulate productive research both by providing a large and representative training dataset as well as providing a platform to objectively evaluate and compare the proposed methods. The benchmark is freely accessible at https://posetrack.net.},
booktitle = cvpr,
author = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
year = {2018},
}
%B
@article{babaeizadehStochasticVariationalVideo2017a,
title = {Stochastic {{Variational Video Prediction}}},
abstract = {Predicting the future in real-world settings, particularly from raw
sensory observations such as images, is exceptionally challenging.
Real-world events can be stochastic and unpredictable, and the high
dimensionality and complexity of natural images requires the predictive
model to build an intricate understanding of the natural world. Many
existing methods tackle this problem by making simplifying assumptions
about the environment. One common assumption is that the outcome is
deterministic and there is only one plausible future. This can lead to
low-quality predictions in real-world settings with stochastic dynamics.
In this paper, we develop a stochastic variational video prediction (SV2P)
method that predicts a different possible future for each sample of its
latent variables. To the best of our knowledge, our model is the first to
provide effective stochastic multi-frame prediction for real-world video.
We demonstrate the capability of the proposed method in predicting
detailed future frames of videos on multiple real-world datasets, both
action-free and action-conditioned. We find that our proposed method
produces substantially improved video predictions when compared to the
same model without stochasticity, and to other stochastic video prediction
methods. Our SV2P implementation will be open sourced upon publication.},
author = {Babaeizadeh, Mohammad and Finn, Chelsea and Erhan, Dumitru and Campbell, Roy H and Levine, Sergey},
journal={arXiv preprint arXiv:1710.11252},
year = {2017},
}
@inproceedings{baigLearningInpaintImage2017,
title = {Learning to {{Inpaint}} for {{Image Compression}}},
booktitle = neurips,
author = {Baig, Mohammad Haris and Koltun, Vladlen and Torresani, Lorenzo},
year = {2017},
pages = {1246-1255}
}
@book{bishopPatternRecognitionMachine2006,
edition = {1st ed. 20},
title = {Pattern {{Recognition}} and {{Machine Learning}}},
isbn = {978-0-387-31073-2},
abstract = {The field of pattern recognition has undergone substantial development over the years. This book reflects these developments while providing a grounding in the basic concepts of pattern recognition and machine learning. It is aimed at advanced undergraduates or first year PhD students, as well as researchers and practitioners.},
publisher = {{Springer}},
author = {Bishop, Christopher M},
month = oct,
year = {2006},
}
%C
@inproceedings{carreira2017quo,
title={{Quo vadis, action recognition? A new model and the kinetics dataset}},
author={Carreira, Joao and Zisserman, Andrew},
booktitle=cvpr,
year={2017}
}
@article{chenVariationalLossyAutoencoder2016,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1611.02731},
title = {Variational {{Lossy Autoencoder}}},
abstract = {Representation learning seeks to expose certain aspects of observed data in a learned representation that's amenable to downstream tasks like classification. For instance, a good representation for 2D images might be one that describes only global structure and discards information about detailed texture. In this paper, we present a simple but principled method to learn such global representations by combining Variational Autoencoder (VAE) with neural autoregressive models such as RNN, MADE and PixelRNN/CNN. Our proposed VAE model allows us to have control over what the global latent code can learn and by designing the architecture accordingly, we can force the global latent code to discard irrelevant information such as texture in 2D images, and hence the VAE only ``autoencodes'' data in a lossy fashion. In addition, by leveraging autoregressive models as both prior distribution p(z) and decoding distribution p(x|z), we can greatly improve generative modeling performance of VAEs, achieving new state-of-the-art results on MNIST, OMNIGLOT and Caltech-101 Silhouettes density estimation tasks as well as competitive results on CIFAR10.},
language = {en},
journal = {arXiv:1611.02731},
author = {Chen, Xi and Kingma, Diederik P. and Salimans, Tim and Duan, Yan and Dhariwal, Prafulla and Schulman, John and Sutskever, Ilya and Abbeel, Pieter},
year = {2016},
}
@article{chenLearningVideoCompression2018,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1804.09869},
primaryClass = {cs, eess},
title = {Learning for {{Video Compression}}},
abstract = {One key challenge to learning-based video compression is that motion predictive coding, a very effective tool for video compression, can hardly be trained into a neural network. In this paper we propose the concept of VoxelCNN which includes motion extension and hybrid prediction networks. VoxelCNN can model spatiotemporal coherence to effectively perform predictive coding inside the learning network. On the basis of VoxelCNN, we further explore a learning based framework for video compression with additional components of iterative analysis/synthesis, binarization, etc. Experiment results demonstrate the effectiveness of the proposed scheme. Although entropy coding and complex configurations are not employed in this paper, we still demonstrate superior performance compared with MPEG-2 and achieve comparable results with H.264 codec. The proposed learning based scheme provides a possible new direction to further improve compression efficiency and functionalities of future video coding.},
language = {English},
journal = {IEEE Transactions on Circuits and Systems for Video Technology},
author = {Chen, Zhibo and He, Tianyu and Jin, Xin and Wu, Feng},
month = apr,
year = {2019},
}
@techreport{ciscoZettabyteEraTrends2017,
title = {The {{Zettabyte Era}}: {{Trends}} and {{Analysis}}},
author = {Cisco},
year = {2017},
}
@book{coverElementsInformationTheory2006,
address = {New York, NY, USA},
title = {{Elements of Information Theory}},
isbn = {978-0-471-24195-9},
publisher = {{Wiley-Interscience}},
author = {Cover, Thomas M. and Thomas, Joy A.},
year = {2006},
}
%D
%E
%F
%G
@article{giraldoRateDistortionAutoEncoders2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1312.7381},
primaryClass = {cs},
title = {Rate-{{Distortion Auto}}-{{Encoders}}},
abstract = {A rekindled the interest in auto-encoder algorithms has been spurred by recent work on deep learning. Current efforts have been directed towards effective training of auto-encoder architectures with a large number of coding units. Here, we propose a learning algorithm for auto-encoders based on a rate-distortion objective that minimizes the mutual information between the inputs and the outputs of the auto-encoder subject to a fidelity constraint. The goal is to learn a representation that is minimally committed to the input data, but that is rich enough to reconstruct the inputs up to certain level of distortion. Minimizing the mutual information acts as a regularization term whereas the fidelity constraint can be understood as a risk functional in the conventional statistical learning setting. The proposed algorithm uses a recently introduced measure of entropy based on infinitely divisible matrices that avoids the plug in estimation of densities. Experiments using over-complete bases show that the rate-distortion auto-encoders can learn a regularized input-output mapping in an implicit manner.},
language = {en},
journal = {arXiv:1312.7381 [cs]},
author = {Giraldo, Luis G. Sanchez and Principe, Jose C.},
month = dec,
year = {2013},
}
@article{gregorLearningRepresentationsMaximizing2011,
title = {Learning {{Representations}} by {{Maximizing Compression}}},
abstract = {We give an algorithm that learns a representation of data through compression. The algorithm 1) predicts bits sequentially from those previously seen and 2) has a structure and a number of computations similar to an autoencoder. The likelihood under the model can be calculated exactly, and arithmetic coding can be used directly for compression. When training on digits the algorithm learns filters similar to those of restricted boltzman machines and denoising autoencoders. Independent samples can be drawn from the model by a single sweep through the pixels. The algorithm has a good compression performance when compared to other methods that work under random ordering of pixels.},
author = {Gregor, Karol and LeCun, Yann},
month = aug,
journal={arXiv preprint arXiv:1108.1169},
year = {2011},
}
@article{gregorDeepAutoRegressiveNetworks2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1310.8499},
title = {Deep {{AutoRegressive Networks}}},
abstract = {We introduce a deep, generative autoencoder capable of learning hierarchies of distributed representations from data. Successive deep stochastic hidden layers are equipped with autoregressive connections, which enable the model to be sampled from quickly and exactly via ancestral sampling. We derive an efficient approximate parameter estimation method based on the minimum description length (MDL) principle, which can be seen as maximising a variational lower bound on the log-likelihood, with a feedforward neural network implementing approximate inference. We demonstrate state-of-the-art generative performance on a number of classic data sets: several UCI data sets, MNIST and Atari 2600 games.},
language = {en},
journal = {arXiv:1310.8499},
author = {Gregor, Karol and Danihelka, Ivo and Mnih, Andriy and Blundell, Charles and Wierstra, Daan},
month = oct,
year = {2013},
}
@article{heMaskRCNN2017,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1703.06870},
primaryClass = {cs},
title = {Mask {{R}}-{{CNN}}},
abstract = {We present a conceptually simple, flexible, and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without bells and whistles, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. Code has been made available at: https://github.com/facebookresearch/Detectron},
journal = {arXiv:1703.06870},
author = {He, Kaiming and Gkioxari, Georgia and Doll\'ar, Piotr and Girshick, Ross},
month = mar,
year = {2017},
}
@inproceedings{AutoencodersMinimumDescription,
title = {Autoencoders, {{Minimum Description Length}} and {{Helmholtz Free Energy}}},
author={Hinton, Geoffrey E and Zemel, Richard S},
booktitle=neurips,
pages={3--10},
year={1994}
}
@inproceedings{hintonKeepingNeuralNetworks,
title = {Keeping {{Neural Networks Simple}} by {{Minimizing}} the {{Description Length}} of the {{Weights}}},
author = {Hinton, Geoffrey E and {van Camp}, Drew},
booktitle={ACM Conf. on Computational Learning Theory},
year={1993},
}
@article{honkelaVariationalLearningBitsBack2004,
title = {Variational {{Learning}} and {{Bits}}-{{Back Coding}}: {{An Information}}-{{Theoretic View}} to {{Bayesian Learning}}},
volume = {15},
issn = {1045-9227},
shorttitle = {Variational {{Learning}} and {{Bits}}-{{Back Coding}}},
doi = {10.1109/TNN.2004.828762},
abstract = {The bits-back coding first introduced by Wallace in 1990 and later by Hinton and van Camp in 1993 provides an interesting link between Bayesian learning and information-theoretic minimum-description-length (MDL) learning approaches. The bits-back coding allows interpreting the cost function used in the variational Bayesian method called ensemble learning as a code length in addition to the Bayesian view of misfit of the posterior approximation and a lower bound of model evidence. Combining these two viewpoints provides interesting insights to the learning process and the functions of different parts of the model. In this paper, the problem of variational Bayesian learning of hierarchical latent variable models is used to demonstrate the benefits of the two views. The code-length interpretation provides new views to many parts of the problem such as model comparison and pruning and helps explain many phenomena occurring in learning.},
language = {en},
number = {4},
journal = {IEEE Transactions on Neural Networks},
author = {Honkela, A. and Valpola, H.},
month = jul,
year = {2004},
pages = {800-810},
}
%I
@article{ioffeBatchNormalizationAccelerating2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1502.03167},
primaryClass = {cs},
title = {Batch {{Normalization}}: {{Accelerating Deep Network Training}} by {{Reducing Internal Covariate Shift}}},
shorttitle = {Batch {{Normalization}}},
abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batchnormalized networks, we improve upon the best published result on ImageNet classification: reaching 4.9\% top-5 validation error (and 4.8\% test error), exceeding the accuracy of human raters.},
language = {en},
journal = {arXiv:1502.03167},
author = {Ioffe, Sergey and Szegedy, Christian},
month = feb,
year = {2015},
}
%J
@inproceedings{johnstonImprovedLossyImage2017,
title = {Improved {{Lossy Image Compression}} with {{Priming}} and {{Spatially Adaptive Bit Rates}} for {{Recurrent Networks}}},
abstract = {We propose a method for lossy image compression based on recurrent, convolutional neural networks that outperforms BPG (4:2:0 ), WebP, JPEG2000, and JPEG as measured by MS-SSIM. We introduce three improvements over previous research that lead to this state-of-the-art result. First, we show that training with a pixel-wise loss weighted by SSIM increases reconstruction quality according to several metrics. Second, we modify the recurrent architecture to improve spatial diffusion, which allows the network to more effectively capture and propagate image information through the network's hidden state. Finally, in addition to lossless entropy coding, we use a spatially adaptive bit allocation algorithm to more efficiently use the limited number of bits to encode visually complex image regions. We evaluate our method on the Kodak and Tecnick image sets and compare against standard codecs as well recently published methods based on deep neural networks.},
author = {Johnston, Nick and Vincent, Damien and Minnen, David and Covell, Michele and Singh, Saurabh and Chinen, Troy and Hwang, Sung Jin and Shor, Joel and Toderici, George},
booktitle = cvpr,
year = {2017},
}
%K
@inproceedings{kalchbrennerVideoPixelNetworks2016,
title = {Video {{Pixel Networks}}},
abstract = {We propose a probabilistic video model, the Video Pixel Network (VPN),
that estimates the discrete joint distribution of the raw pixel values in
a video. The model and the neural architecture reflect the time, space and
color structure of video tensors and encode it as a four-dimensional
dependency chain. The VPN approaches the best possible performance on the
Moving MNIST benchmark, a leap over the previous state of the art, and the
generated videos show only minor deviations from the ground truth. The VPN
also produces detailed samples on the action-conditional Robotic Pushing
benchmark and generalizes to the motion of novel objects.},
author = {Kalchbrenner, Nal and {van den Oord}, Aaron and Simonyan, Karen and Danihelka, Ivo and Vinyals, Oriol and Graves, Alex and Kavukcuoglu, Koray},
booktitle = icml,
pages={1771--1779},
year={2017},
}
@inproceedings{kingmaAdamMethodStochastic2015,
title = {Adam: {{A Method}} for {{Stochastic Optimization}}},
booktitle = iclr,
author = {Kingma, Diederik P and Ba, Jimmy},
year = {2015},
}
@article{kingmaAutoEncodingVariationalBayes2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1312.6114},
primaryClass = {cs, stat},
title = {Auto-{{Encoding Variational Bayes}}},
abstract = {How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions is two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results.},
journal = {arXiv:1312.6114},
author = {Kingma, Diederik P. and Welling, Max},
month = dec,
year = {2013},
}
%L
@inproceedings{laudeDeepLearningbasedIntra2016a,
title = {Deep Learning-Based Intra Prediction Mode Decision for {{HEVC}}},
doi = {10.1109/PCS.2016.7906399},
abstract = {The High Efficiency Video Coding standard and its screen content coding
extension provide superior coding efficiency compared to predecessor
standards. However, this coding efficiency is achieved at the expense of
very complex encoders. One major complexity driver is the comprehensive
rate distortion (RD) optimization. In this paper, we present a deep
learning-based encoder control which replaces the conventional RD
optimization for the intra prediction mode with deep convolutional neural
network (CNN) classifiers. Thereby, we save the RD optimization
complexity. Our classifiers operate independently of any encoder decisions
and reconstructed sample values. Thus, no additional systematic latency is
introduced. Furthermore, the loss in coding efficiency is negligible with
an average value of 0.52\% over HM-16.6+SCM-5.2.},
booktitle = {Picture Coding Symposium},
author = {Laude, T and Ostermann, J},
month = dec,
year = {2016},
pages = {1-5}
}
@unpublished{CONTEXTADAPTIVEENTROPYMODEL,
title = {Context-Adaptive {{Entropy Model}} for {{End}}-to-End {{Optimized Image Compression}}},
author = {Lee, Jooyoung and Cho, Seunghyun and Beack, Seung-Kwon},
year = {2018},
}
@inproceedings{li2018learning,
title = {Learning Convolutional Networks for Content-Weighted Image Compression},
booktitle = cvpr,
author = {Li, Mu and Zuo, Wangmeng and Gu, Shuhang and Zhao, Debin and Zhang, David},
year = {2018},
}
@article{liDisentangledSequentialAutoencoder2018a,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1803.02991},
primaryClass = {cs},
title = {Disentangled {{Sequential Autoencoder}}},
abstract = {We present a VAE architecture for encoding and generating high dimensional sequential data, such as video or audio. Our deep generative model learns a latent representation of the data which is split into a static and dynamic part, allowing us to approximately disentangle latent time-dependent features (dynamics) from features which are preserved over time (content). This architecture gives us partial control over generating content and dynamics by conditioning on either one of these sets of features. In our experiments on artificially generated cartoon video clips and voice recordings, we show that we can convert the content of a given sequence into another one by such content swapping. For audio, this allows us to convert a male speaker into a female speaker and vice versa, while for video we can separately manipulate shapes and dynamics. Furthermore, we give empirical evidence for the hypothesis that stochastic RNNs as latent state models are more efficient at compressing and generating long sequences than deterministic ones, which may be relevant for applications in video compression.},
journal = {arXiv:1803.02991 [cs]},
author = {Li, Yingzhen and Mandt, Stephan},
month = mar,
year = {2018},
}
%M
@article{mehriSampleRNNUnconditionalEndtoEnd2016,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1612.07837},
primaryClass = {cs},
title = {{{SampleRNN}}: {{An Unconditional End}}-to-{{End Neural Audio Generation Model}}},
shorttitle = {{{SampleRNN}}},
abstract = {In this paper we propose a novel model for unconditional audio generation based on generating one audio sample at a time. We show that our model, which profits from combining memory-less modules, namely autoregressive multilayer perceptrons, and stateful recurrent neural networks in a hierarchical structure is able to capture underlying sources of variations in the temporal sequences over very long time spans, on three datasets of different nature. Human evaluation on the generated samples indicate that our model is preferred over competing models. We also show how each component of the model contributes to the exhibited performance.},
language = {en},
journal = {arXiv:1612.07837},
author = {Mehri, Soroush and Kumar, Kundan and Gulrajani, Ishaan and Kumar, Rithesh and Jain, Shubham and Sotelo, Jose and Courville, Aaron and Bengio, Yoshua},
month = dec,
year = {2016},
}
%N
%O
@inproceedings{ofliBerkeleyMHADComprehensive2013,
address = {Clearwater Beach, FL, USA},
title = {Berkeley {{MHAD}}: {{A}} Comprehensive {{Multimodal Human Action Database}}},
isbn = {978-1-4673-5054-9 978-1-4673-5053-2 978-1-4673-5052-5},
shorttitle = {Berkeley {{MHAD}}},
doi = {10.1109/WACV.2013.6474999},
abstract = {Over the years, a large number of methods have been proposed to analyze human pose and motion information from images, videos, and recently from depth data. Most methods, however, have been evaluated on datasets that were too specific to each application, limited to a particular modality, and more importantly, captured under unknown conditions. To address these issues, we introduce the Berkeley Multimodal Human Action Database (MHAD) consisting of temporally synchronized and geometrically calibrated data from an optical motion capture system, multibaseline stereo cameras from multiple views, depth sensors, accelerometers and microphones. This controlled multimodal dataset provides researchers an inclusive testbed to develop and benchmark new algorithms across multiple modalities under known capture conditions in various research domains. To demonstrate possible use of MHAD for action recognition, we compare results using the popular Bag-of-Words algorithm adapted to each modality independently with the results of various combinations of modalities using the Multiple Kernel Learning. Our comparative results show that multimodal analysis of human motion yields better action recognition rates than unimodal analysis.},
language = {en},
booktitle = {{{IEEE Workshop}} on {{Applications}} of {{Computer Vision}}},
author = {Ofli, Ferda and Chaudhry, Rizwan and Kurillo, Gregorij and Vidal, Rene and Bajcsy, Ruzena},
month = jan,
year = {2013},
pages = {53-60},
}
%P
@unpublished{pessoaEndtoEndLearningVideo2018,
title = {End-to-{{End Learning}} of {{Video Compression}} Using {{Spatio}}-{{Temporal Autoencoders}}},
author = {Pessoa, Jorge and Aidos, Helena and Tom\'as, Pedro and Figueiredo, M\'ario AT},
year = {2018},
}
@inproceedings{prakashSemanticPerceptualImage2017a,
title = {Semantic {{Perceptual Image Compression Using Deep Convolution Networks}}},
doi = {10.1109/DCC.2017.56},
abstract = {It has long been considered a significant problem to improve the visual
quality of lossy image and video compression. Recent advances in computing
power together with the availability of large training data sets has
increased interest in the application of deep learning CNNs to address
image recognition and image processing tasks. Here, we present a powerful
CNN tailored to the specific task of semantic image understanding to
achieve higher visual quality in lossy compression. A modest increase in
complexity is incorporated to the encoder which allows a standard,
off-the-shelf JPEG decoder to be used. While JPEG encoding may be
optimized for generic images, the process is ultimately unaware of the
specific content of the image to be compressed. Our technique makes JPEG
content-aware by designing and training a model to identify multiple
semantic regions in a given image. Unlike object detection techniques, our
model does not require labeling of object positions and is able to
identify objects in a single pass. We present a new CNN architecture
directed specifically to image compression, which generates a map that
highlights semantically-salient regions so that they can be encoded at
higher quality as compared to background regions. By adding a complete set
of features for every class, and then taking a threshold over the sum of
all feature activations, we generate a map that highlights
semantically-salient regions so that they can be encoded at a better
quality compared to background regions. Experiments are presented on the
Kodak PhotoCD dataset and the MIT Saliency Benchmark dataset, in which our
algorithm achieves higher visual quality for the same compressed size
while preserving PSNR.},
booktitle = {Data Compression Conference},
author = {Prakash, A and Moran, N and Garber, S and Dilillo, A and Storer, J},
month = apr,
year = {2017},
pages = {250-259}
}
%R
@article{rezendeStochasticBackpropagationApproximate2014,
title = {Stochastic {{Backpropagation}} and {{Approximate Inference}} in {{Deep Generative Models}}},
language = {en},
author = {Rezende, Danilo Jimenez and Mohamed, Shakir and Wierstra, Daan},
year = {2014},
journal={arXiv preprint arXiv:1401.4082},
}
@inproceedings{rippelRealTimeAdaptiveImage2017a,
title = {Real-{{Time Adaptive Image Compression}}},
abstract = {We present a machine learning-based approach to lossy image compression
which outperforms all existing codecs, while running in real-time. Our
algorithm typically produces file sizes 3 times smaller than JPEG, 2.5
times smaller than JPEG 2000, and 2.3 times smaller than WebP on datasets
of generic images across a spectrum of quality levels. At the same time,
our codec is designed to be lightweight and deployable: for example, it
can encode or decode the Kodak dataset in less than 10ms per image on GPU.
Our architecture is an autoencoder featuring pyramidal analysis, an
adaptive coding module, and regularization of the expected codelength. We
also supplement our approach with adversarial training specialized towards
use in a compression setting: this enables us to produce visually pleasing
reconstructions for very low bitrates.},
author = {Rippel, Oren and Bourdev, Lubomir},
year = {2017},
booktitle = icml,
pages={2922--2930},
}
%S
@inproceedings{salimansPIXELCNNIMPROVINGPIXELCNN2017,
title = {{{PixelCNN}}++: {{Improving}} the {{PixelCNN}} with {{Discretized Logistic Mixture Likelihood}} and {{Other Modifications}}},
abstract = {PixelCNNs are a recently proposed class of powerful generative models with tractable likelihood. Here we discuss our implementation of PixelCNNs which we make available at https://github.com/openai/pixel-cnn. Our implementation contains a number of modifications to the original model that both simplify its structure and improve its performance. 1) We use a discretized logistic mixture likelihood on the pixels, rather than a 256-way softmax, which we find to speed up training. 2) We condition on whole pixels, rather than R/G/B sub-pixels, simplifying the model structure. 3) We use downsampling to efficiently capture structure at multiple resolutions. 4) We introduce additional short-cut connections to further speed up optimization. 5) We regularize the model using dropout. Finally, we present state-of-the-art log likelihood results on CIFAR-10 to demonstrate the usefulness of these modifications.},
language = {English},
author = {Salimans, Tim and Karpathy, Andrej and Chen, Xi and Kingma, Diederik P},
year = {2017},
booktitle = iclr,
}
@inproceedings{santurkar2018generative,
title = {Generative Compression},
booktitle = {Picture Coding Symposium},
author = {Santurkar, Shibani and Budden, David and Shavit, Nir},
year = {2018},
pages = {258-262}
}
@unpublished{snellLearningGenerateImages2015,
title = {Learning to {{Generate Images}} with {{Perceptual Similarity Metrics}}},
abstract = {Deep networks are increasingly being applied to problems involving image synthesis, e.g., generating images from textual descriptions and reconstructing an input image from a compact representation. Supervised training of image-synthesis networks typically uses a pixel-wise loss (PL) to indicate the mismatch between a generated image and its corresponding target image. We propose instead to use a loss function that is better calibrated to human perceptual judgments of image quality: the multiscale structural-similarity score (MS-SSIM). Because MS-SSIM is differentiable, it is easily incorporated into gradient-descent learning. We compare the consequences of using MS-SSIM versus PL loss on training deterministic and stochastic autoencoders. For three different architectures, we collected human judgments of the quality of image reconstructions. Observers reliably prefer images synthesized by MS-SSIM-optimized models over those synthesized by PL-optimized models, for two distinct PL measures (\$\textbackslash{}ell\_1\$ and \$\textbackslash{}ell\_2\$ distances). We also explore the effect of training objective on image encoding and analyze conditions under which perceptually-optimized representations yield better performance on image classification. Finally, we demonstrate the superiority of perceptually-optimized networks for super-resolution imaging. Just as computer vision has advanced through the use of convolutional architectures that mimic the structure of the mammalian visual system, we argue that significant additional advances can be made in modeling images through the use of training objectives that are well aligned to characteristics of human perception.},
author = {Snell, Jake and Ridgeway, Karl and Liao, Renjie and Roads, Brett D and Mozer, Michael C and Zemel, Richard S},
month = nov,
year = {2015},
}
@article{sullivanOverviewHighEfficiency2012,
title = {{Overview of the High Efficiency Video Coding (HEVC) Standard}},
volume = {22},
issn = {1051-8215},
number = {12},
journal = {IEEE Trans. Circuits Syst. Video Technol.},
author = {Sullivan, G J and Ohm, J R and Han, W J and Wiegand, T},
month = dec,
year = {2012},
pages = {1649-1668}
}
%T
@article{todericiVariableRateImage2015a,
title = {Variable {{Rate Image Compression}} with {{Recurrent Neural Networks}}},
author = {Toderici, George and O'Malley, Sean M and Hwang, Sung Jin and Vincent, Damien and Minnen, David and Baluja, Shumeet and Covell, Michele and Sukthankar, Rahul},
year = {2016},
booktitle = iclr,
}
@inproceedings{todericiFullResolutionImage2017,
title = {Full {{Resolution Image Compression With Recurrent Neural Networks}}},
booktitle = cvpr,
author = {Toderici, George and Vincent, Damien and Johnston, Nick and Hwang, Sung Jin and Minnen, David and Shor, Joel and Covell, Michele},
year = {2017},
}
@inproceedings{tsaiLearningBinaryResidual2017a,
title = {Learning {{Binary Residual Representations}} for {{Domain}}-Specific {{Video Streaming}}},
abstract = {We study domain-specific video streaming. Specifically, we target a
streaming setting where the videos to be streamed from a server to a
client are all in the same domain and they have to be compressed to a
small size for low-latency transmission. Several popular video streaming
services, such as the video game streaming services of GeForce Now and
Twitch, fall in this category. While conventional video compression
standards such as H.264 are commonly used for this task, we hypothesize
that one can leverage the property that the videos are all in the same
domain to achieve better video quality. Based on this hypothesis, we
propose a novel video compression pipeline. Specifically, we first apply
H.264 to compress domain-specific videos. We then train a novel binary
autoencoder to encode the leftover domain-specific residual information
frame-by-frame into binary representations. These binary representations
are then compressed and sent to the client together with the H.264 stream.
In our experiments, we show that our pipeline yields consistent gains over
standard H.264 compression across several benchmark datasets while using
the same channel bandwidth.},
author = {Tsai, Yi-Hsuan and Liu, Ming-Yu and Sun, Deqing and Yang, Ming-Hsuan and Kautz, Jan},
year = {2018},
booktitle = {AAAI},
keywords = {Machine Learning \& Statistics/Generative/Data Compression}
}
%U
%V
%W
@article{wainwrightGraphicalModelsExponential2007a,
title = {Graphical {{Models}}, {{Exponential Families}}, and {{Variational Inference}}},
volume = {1},
issn = {1935-8237, 1935-8245},
doi = {10.1561/2200000001},
language = {English},
number = {1--2},
journal = {Foundations and Trends\textregistered{} in Machine Learning},
author = {Wainwright, Martin J. and Jordan, Michael I.},
year = {2007},
keywords = {Machine Learning \& Statistics,Paper bibliographies/ICML2015,Paper bibliographies/PhD Thesis},
pages = {1-305},
}
@inproceedings{wangLieGroupTransformation2011a,
title = {Lie {{Group Transformation Models}} for {{Predictive Video Coding}}},
booktitle = {Data Compression Conference},
author = {Wang, C M and {Sohl-Dickstein}, J and Tosic, Ivana and Olshausen, Bruno A},
year = {2011},
keywords = {Machine Learning \& Statistics,Machine Learning \& Statistics/Group Theoretical Learning,Paper bibliographies/MSc Thesis,Paper bibliographies/ICLR2015},
pages = {83-92}
}
@article{wangMultiScaleStructuralSimilarity2003,
title={{Image quality assessment: from error visibility to structural similarity}},
author={Wang, Zhou and Bovik, Alan C and Sheikh, Hamid R and Simoncelli, Eero P},
journal={IEEE Trans. on Image Processing},
volume={13},
number={4},
pages={600--612},
year={2004}
}
%Z
%DATASETS
@misc{UVG,
title = {{Ultra Video Group} test sequences},
howpublished = {\url{http://ultravideo.cs.tut.fi/}},
note = {Accessed: 2020-02-21}
}
@misc{Xiph,
title = {Xiph.org Video Test Media [derf's collection]},
howpublished = {\url{https://media.xiph.org/video/derf/}},
note = {Accessed: 2020-02-21}
}
@misc{VTL,
title = {Video trace library},
howpublished = {\url{http://trace.eas.asu.edu/index.html}},
note = {Accessed: 2019-03-18}
}
@inproceedings{posetrack,
title={Detect-and-track: Efficient pose estimation in videos},
author={Girdhar, Rohit and Gkioxari, Georgia and Torresani, Lorenzo and Paluri, Manohar and Tran, Du},
booktitle={CVPR},
year={2018}
}
%OTHERS, UNUSED(?)
@unpublished{ADAPTIVESAMPLESPACEADAPTIVEa,
title = {Adaptive Sample-Space \& Adaptive Probability Coding: A Neural-Network Based Approach for Compression},
}
@unpublished{CONTEXTADAPTIVEENTROPYMODEL,
title = {Context-Adaptive Entropy Model for End-to-End Optimized Image Compression},
}
@unpublished{GENERATIVEADVERSARIALNETWORKS,
title = {Generative {{Adversarial Networks}} for {{Extreme Learned Image Compression}}},
}
@unpublished{PRACTICALLOSSLESSCOMPRESSIONa,
title = {Practical Lossless Compression with Latent Variables Using Bits Back Coding},
}
@unpublished{bastiaankleijnWavenetBasedLow2017a,
title = {Wavenet Based Low Rate Speech Coding},
abstract = {Traditional parametric coding of speech facilitates low rate but provides
poor reconstruction quality because of the inadequacy of the model used.
We describe how a WaveNet generative speech model can be used to generate
high quality speech from the bit stream of a standard parametric coder
operating at 2.4 kb/s. We compare this parametric coder with a waveform
coder based on the same generative model and show that approximating the
signal waveform incurs a large rate penalty. Our experiments confirm the
high performance of the WaveNet based coder and show that the speech
produced by the system is able to additionally perform implicit bandwidth
extension and does not significantly impair recognition of the original
speaker for the human listener, even when that speaker has not been used
during the training of the generative model.},
author = {Kleijn, W Bastiaan and Lim, Felicia S C and Luebs, Alejandro and Skoglund, Jan and Stimberg, Florian and Wang, Quan and Walters, Thomas C},
month = dec,
year = {2017},
keywords = {Machine Learning \& Statistics/Generative/Data Compression}
}