From 1f3e865eb4f43feb97f65f923e4fd3952afb8ea4 Mon Sep 17 00:00:00 2001 From: Yuri Putivsky Date: Mon, 3 Feb 2020 14:56:24 -0800 Subject: [PATCH] Integrated base decoder into VideoReader class and video_utils.py (#1766) Summary: Pull Request resolved: https://github.com/pytorch/vision/pull/1766 Replaced FfmpegDecoder (incompatible with VUE) with the base decoder (compatible with VUE). Modified the Python utilities in video_utils.py for internal simplification. The public interface is preserved. Reviewed By: fmassa Differential Revision: D19415903 fbshipit-source-id: 4d7a0158bd77bac0a18732fe4183fdd9a57f6402 --- setup.py | 32 +- .../csrc/cpu/decoder/audio_sampler.cpp | 40 +- torchvision/csrc/cpu/decoder/audio_sampler.h | 2 - torchvision/csrc/cpu/decoder/audio_stream.cpp | 53 +- torchvision/csrc/cpu/decoder/audio_stream.h | 5 - torchvision/csrc/cpu/decoder/cc_stream.cpp | 2 - torchvision/csrc/cpu/decoder/cc_stream.h | 2 - torchvision/csrc/cpu/decoder/decoder.cpp | 217 ++++--- torchvision/csrc/cpu/decoder/decoder.h | 20 +- torchvision/csrc/cpu/decoder/defs.h | 83 ++- .../csrc/cpu/decoder/memory_buffer.cpp | 75 +++ torchvision/csrc/cpu/decoder/memory_buffer.h | 25 + .../csrc/cpu/decoder/seekable_buffer.cpp | 159 +++-- .../csrc/cpu/decoder/seekable_buffer.h | 29 +- torchvision/csrc/cpu/decoder/stream.cpp | 201 +++++-- torchvision/csrc/cpu/decoder/stream.h | 38 +- .../csrc/cpu/decoder/subtitle_sampler.cpp | 2 - .../csrc/cpu/decoder/subtitle_sampler.h | 2 - .../csrc/cpu/decoder/subtitle_stream.cpp | 31 +- .../csrc/cpu/decoder/subtitle_stream.h | 8 +- torchvision/csrc/cpu/decoder/sync_decoder.cpp | 35 +- torchvision/csrc/cpu/decoder/sync_decoder.h | 7 +- .../csrc/cpu/decoder/sync_decoder_test.cpp | 139 ++++- torchvision/csrc/cpu/decoder/time_keeper.cpp | 12 +- torchvision/csrc/cpu/decoder/time_keeper.h | 8 +- torchvision/csrc/cpu/decoder/util.cpp | 2 - torchvision/csrc/cpu/decoder/util.h | 2 - .../csrc/cpu/decoder/video_sampler.cpp | 2 - torchvision/csrc/cpu/decoder/video_sampler.h | 2 - torchvision/csrc/cpu/decoder/video_stream.cpp | 58 +- torchvision/csrc/cpu/decoder/video_stream.h | 9 +- .../cpu/video_reader/FfmpegAudioSampler.cpp | 118 ---- .../cpu/video_reader/FfmpegAudioSampler.h | 32 - .../cpu/video_reader/FfmpegAudioStream.cpp | 103 ---- .../csrc/cpu/video_reader/FfmpegAudioStream.h | 54 -- .../csrc/cpu/video_reader/FfmpegDecoder.cpp | 412 ------------- .../csrc/cpu/video_reader/FfmpegDecoder.h | 127 ---- .../csrc/cpu/video_reader/FfmpegHeaders.h | 13 - .../csrc/cpu/video_reader/FfmpegSampler.h | 16 - .../csrc/cpu/video_reader/FfmpegStream.cpp | 188 ------ .../csrc/cpu/video_reader/FfmpegStream.h | 69 --- .../csrc/cpu/video_reader/FfmpegUtil.cpp | 111 ---- .../csrc/cpu/video_reader/FfmpegUtil.h | 27 - .../cpu/video_reader/FfmpegVideoSampler.cpp | 90 --- .../cpu/video_reader/FfmpegVideoSampler.h | 32 - .../cpu/video_reader/FfmpegVideoStream.cpp | 115 ---- .../csrc/cpu/video_reader/FfmpegVideoStream.h | 54 -- .../csrc/cpu/video_reader/Interface.cpp | 22 - torchvision/csrc/cpu/video_reader/Interface.h | 127 ---- .../csrc/cpu/video_reader/VideoReader.cpp | 547 ++++++++++++------ .../csrc/cpu/video_reader/VideoReader.h | 96 --- torchvision/csrc/cpu/video_reader/util.cpp | 60 -- torchvision/csrc/cpu/video_reader/util.h | 26 - 53 files changed, 1144 insertions(+), 2597 deletions(-) create mode 100644 torchvision/csrc/cpu/decoder/memory_buffer.cpp create mode 100644 torchvision/csrc/cpu/decoder/memory_buffer.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp delete mode 100644 
torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegDecoder.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegHeaders.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegSampler.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegStream.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegStream.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegUtil.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h delete mode 100644 torchvision/csrc/cpu/video_reader/Interface.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/Interface.h delete mode 100644 torchvision/csrc/cpu/video_reader/util.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/util.h diff --git a/setup.py b/setup.py index 4ebf6355e7a..bba3bb6fe45 100644 --- a/setup.py +++ b/setup.py @@ -180,41 +180,21 @@ def get_extensions(): ffmpeg_root = os.path.dirname(ffmpeg_bin) ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include') - # TorchVision video reader + # TorchVision base decoder + video reader video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader') video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp")) - - ext_modules.append( - CppExtension( - 'torchvision.video_reader', - video_reader_src, - include_dirs=[ - video_reader_src_dir, - ffmpeg_include_dir, - extensions_dir, - ], - libraries=[ - 'avcodec', - 'avformat', - 'avutil', - 'swresample', - 'swscale', - ], - extra_compile_args=["-std=c++14"], - extra_link_args=["-std=c++14"], - ) - ) - - # TorchVision base decoder base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder') base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "[!sync_decoder_test]*.cpp")) + combined_src = video_reader_src + base_decoder_src + ext_modules.append( CppExtension( - 'torchvision.base_decoder', - base_decoder_src, + 'torchvision.video_reader', + combined_src, include_dirs=[ base_decoder_src_dir, + video_reader_src_dir, ffmpeg_include_dir, extensions_dir, ], diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.cpp b/torchvision/csrc/cpu/decoder/audio_sampler.cpp index c10fceb852d..4092df98359 100644 --- a/torchvision/csrc/cpu/decoder/audio_sampler.cpp +++ b/torchvision/csrc/cpu/decoder/audio_sampler.cpp @@ -1,15 +1,10 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "audio_sampler.h" #include #include "util.h" -// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24 - -#ifndef SWR_CH_MAX -#define SWR_CH_MAX 32 -#endif +#define AVRESAMPLE_MAX_CHANNELS 32 +// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24 namespace ffmpeg { namespace { @@ -94,9 +89,12 @@ int AudioSampler::numOutputSamples(int inSamples) const { } int AudioSampler::getSamplesBytes(AVFrame* frame) const { - return av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) * - numOutputSamples(frame ? 
frame->nb_samples : 0) * - params_.out.audio.channels; + return av_samples_get_buffer_size( + nullptr, + params_.out.audio.channels, + numOutputSamples(frame ? frame->nb_samples : 0), + (AVSampleFormat)params_.out.audio.format, + 1); } int AudioSampler::sample( @@ -104,7 +102,7 @@ int AudioSampler::sample( int inNumSamples, ByteStorage* out, int outNumSamples) { - uint8_t* outPlanes[SWR_CH_MAX] = {nullptr}; + uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; int result; if ((result = preparePlanes( params_.out.audio, out->writableTail(), outNumSamples, outPlanes)) < @@ -140,9 +138,12 @@ int AudioSampler::sample(AVFrame* frame, ByteStorage* out) { return 0; } - const auto samplesBytes = - av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) * - outNumSamples * params_.out.audio.channels; + const auto samplesBytes = av_samples_get_buffer_size( + nullptr, + params_.out.audio.channels, + outNumSamples, + (AVSampleFormat)params_.out.audio.format, + 1); // bytes must be allocated CHECK_LE(samplesBytes, out->tail()); @@ -167,14 +168,17 @@ int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) { return 0; } - const auto samplesBytes = - av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) * - outNumSamples * params_.out.audio.channels; + const auto samplesBytes = av_samples_get_buffer_size( + nullptr, + params_.out.audio.channels, + outNumSamples, + (AVSampleFormat)params_.out.audio.format, + 1); out->clear(); out->ensure(samplesBytes); - uint8_t* inPlanes[SWR_CH_MAX] = {nullptr}; + uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; int result; if (in && (result = preparePlanes( diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.h b/torchvision/csrc/cpu/decoder/audio_sampler.h index d68a21ea20e..c6a021d2084 100644 --- a/torchvision/csrc/cpu/decoder/audio_sampler.h +++ b/torchvision/csrc/cpu/decoder/audio_sampler.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/audio_stream.cpp b/torchvision/csrc/cpu/decoder/audio_stream.cpp index 17ab9fceb7b..ed4d6622ecd 100644 --- a/torchvision/csrc/cpu/decoder/audio_stream.cpp +++ b/torchvision/csrc/cpu/decoder/audio_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "audio_stream.h" #include #include @@ -8,11 +6,23 @@ namespace ffmpeg { namespace { +bool operator==(const AudioFormat& x, const AVFrame& y) { + return x.samples == y.sample_rate && x.channels == y.channels && + x.format == y.format; +} + bool operator==(const AudioFormat& x, const AVCodecContext& y) { return x.samples == y.sample_rate && x.channels == y.channels && x.format == y.sample_fmt; } +AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) { + x.samples = y.sample_rate; + x.channels = y.channels; + x.format = y.format; + return x; +} + AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) { x.samples = y.sample_rate; x.channels = y.channels; @@ -29,7 +39,8 @@ AudioStream::AudioStream( : Stream( inputCtx, MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime) {} + convertPtsToWallTime, + 0) {} AudioStream::~AudioStream() { if (sampler_) { @@ -65,12 +76,15 @@ int AudioStream::initFormat() { int AudioStream::estimateBytes(bool flush) { ensureSampler(); - if (!(sampler_->getInputFormat().audio == *codecCtx_)) { + // check if input format gets changed + if (flush ? 
!(sampler_->getInputFormat().audio == *codecCtx_) + : !(sampler_->getInputFormat().audio == *frame_)) { // - reinit sampler SamplerParameters params; params.type = format_.type; params.out = format_.format; - toAudioFormat(params.in.audio, *codecCtx_); + flush ? toAudioFormat(params.in.audio, *codecCtx_) + : toAudioFormat(params.in.audio, *frame_); if (flush || !sampler_->init(params)) { return -1; } @@ -84,7 +98,7 @@ int AudioStream::estimateBytes(bool flush) { << ", channels: " << format_.format.audio.channels << ", format: " << format_.format.audio.format; } - return sampler_->getSamplesBytes(frame_); + return sampler_->getSamplesBytes(flush ? nullptr : frame_); } int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { @@ -92,31 +106,4 @@ int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { return sampler_->sample(flush ? nullptr : frame_, out); } -void AudioStream::setHeader(DecoderHeader* header) { - header->seqno = numGenerator_++; - - if (codecCtx_->time_base.num != 0) { - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - codecCtx_->time_base, - AV_TIME_BASE_Q); - } else { - // If the codec time_base is missing then we would've skipped the - // rescalePackage step to rescale to codec time_base, so here we can - // rescale straight from the stream time_base into AV_TIME_BASE_Q. - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - inputCtx_->streams[format_.stream]->time_base, - AV_TIME_BASE_Q); - } - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->keyFrame = 1; - header->fps = std::numeric_limits::quiet_NaN(); - header->format = format_; -} - } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/audio_stream.h b/torchvision/csrc/cpu/decoder/audio_stream.h index c7708a3356d..4d200114e4a 100644 --- a/torchvision/csrc/cpu/decoder/audio_stream.h +++ b/torchvision/csrc/cpu/decoder/audio_stream.h @@ -1,10 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "audio_sampler.h" #include "stream.h" -#include "time_keeper.h" namespace ffmpeg { @@ -25,13 +22,11 @@ class AudioStream : public Stream { int initFormat() override; int estimateBytes(bool flush) override; int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header) override; void ensureSampler(); private: std::unique_ptr sampler_; - TimeKeeper keeper_; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/cc_stream.cpp b/torchvision/csrc/cpu/decoder/cc_stream.cpp index 47de485b100..7b443146289 100644 --- a/torchvision/csrc/cpu/decoder/cc_stream.cpp +++ b/torchvision/csrc/cpu/decoder/cc_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "cc_stream.h" namespace ffmpeg { diff --git a/torchvision/csrc/cpu/decoder/cc_stream.h b/torchvision/csrc/cpu/decoder/cc_stream.h index 34506d3259f..d8c98f7be23 100644 --- a/torchvision/csrc/cpu/decoder/cc_stream.h +++ b/torchvision/csrc/cpu/decoder/cc_stream.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "subtitle_stream.h" diff --git a/torchvision/csrc/cpu/decoder/decoder.cpp b/torchvision/csrc/cpu/decoder/decoder.cpp index d8f324863e4..b78c1e47214 100644 --- a/torchvision/csrc/cpu/decoder/decoder.cpp +++ b/torchvision/csrc/cpu/decoder/decoder.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "decoder.h" #include #include @@ -15,9 +13,8 @@ namespace ffmpeg { namespace { -constexpr ssize_t kMinSeekBufferSize = 1024; -constexpr ssize_t kMaxSeekBufferSize = 4 * 1024; -constexpr size_t kIoBufferSize = 4 * 1024; +constexpr size_t kIoBufferSize = 96 * 1024; +constexpr size_t kIoPaddingSize = AV_INPUT_BUFFER_PADDING_SIZE; constexpr size_t kLogBufferSize = 1024; int ffmpeg_lock(void** mutex, enum AVLockOp op) { @@ -205,7 +202,7 @@ void Decoder::initOnce() { av_lockmgr_register(&ffmpeg_lock); av_log_set_callback(Decoder::logFunction); av_log_set_level(AV_LOG_ERROR); - LOG(INFO) << "Registered ffmpeg libs"; + VLOG(1) << "Registered ffmpeg libs"; }); } @@ -213,10 +210,6 @@ Decoder::Decoder() { initOnce(); } -Decoder::~Decoder() { - cleanUp(); -} - bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { cleanUp(); @@ -229,42 +222,28 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { // set callback and params params_ = params; - auto tmpCtx = avformat_alloc_context(); - - if (!tmpCtx) { + if (!(inputCtx_ = avformat_alloc_context())) { LOG(ERROR) << "Cannot allocate format context"; return false; } AVInputFormat* fmt = nullptr; + int result = 0; if (in) { - const size_t avioCtxBufferSize = kIoBufferSize; - uint8_t* avioCtxBuffer = (uint8_t*)av_malloc(avioCtxBufferSize); - if (!avioCtxBuffer) { - LOG(ERROR) << "av_malloc cannot allocate " << avioCtxBufferSize - << " bytes"; - avformat_close_input(&tmpCtx); - cleanUp(); - return false; - } - - bool canSeek = in(nullptr, 0, 0) == 0; - - if (!seekableBuffer_.init( - std::forward<DecoderInCallback>(in), - kMinSeekBufferSize, - kMaxSeekBufferSize, - params_.timeoutMs)) { - LOG(ERROR) << "seekable buffer initialization failed"; - av_free(avioCtxBuffer); - avformat_close_input(&tmpCtx); + ImageType type = ImageType::UNKNOWN; + if ((result = seekableBuffer_.init( + std::forward<DecoderInCallback>(in), + params_.timeoutMs, + params_.maxSeekableBytes, + params_.isImage ? &type : nullptr)) < 0) { + LOG(ERROR) << "can't initiate seekable buffer"; cleanUp(); return false; } if (params_.isImage) { const char* fmtName = "image2"; - switch (seekableBuffer_.getImageType()) { + switch (type) { case ImageType::JPEG: fmtName = "jpeg_pipe"; break; @@ -281,6 +260,16 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { fmt = av_find_input_format(fmtName); } + const size_t avioCtxBufferSize = kIoBufferSize; + uint8_t* avioCtxBuffer = + (uint8_t*)av_malloc(avioCtxBufferSize + kIoPaddingSize); + if (!avioCtxBuffer) { + LOG(ERROR) << "av_malloc cannot allocate " << avioCtxBufferSize + << " bytes"; + cleanUp(); + return false; + } + if (!(avioCtx_ = avio_alloc_context( avioCtxBuffer, avioCtxBufferSize, 0, reinterpret_cast<void*>(this), &Decoder::readFunction, nullptr, - canSeek ? &Decoder::seekFunction : nullptr))) { + result == 1 ? 
&Decoder::seekFunction : nullptr))) { LOG(ERROR) << "avio_alloc_context failed"; av_free(avioCtxBuffer); - avformat_close_input(&tmpCtx); cleanUp(); return false; } - tmpCtx->pb = avioCtx_; + inputCtx_->pb = avioCtx_; + inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; } - interrupted_ = false; - // ffmpeg avformat_open_input call can hang if media source doesn't respond - // set a guard for handle such situations - std::promise<bool> p; - std::future<bool> f = p.get_future(); - std::thread guard([&f, this]() { - auto timeout = std::chrono::milliseconds(params_.timeoutMs); - if (std::future_status::timeout == f.wait_for(timeout)) { - LOG(ERROR) << "Cannot open stream within " << params_.timeoutMs << " ms"; - interrupted_ = true; - } - }); - - tmpCtx->opaque = reinterpret_cast<void*>(this); - tmpCtx->interrupt_callback.callback = Decoder::shutdownFunction; - tmpCtx->interrupt_callback.opaque = reinterpret_cast<void*>(this); + inputCtx_->opaque = reinterpret_cast<void*>(this); + inputCtx_->interrupt_callback.callback = Decoder::shutdownFunction; + inputCtx_->interrupt_callback.opaque = reinterpret_cast<void*>(this); // add network timeout - tmpCtx->flags |= AVFMT_FLAG_NONBLOCK; + inputCtx_->flags |= AVFMT_FLAG_NONBLOCK; AVDictionary* options = nullptr; av_dict_set_int(&options, "analyzeduration", params_.timeoutMs * 1000, 0); @@ -326,19 +302,38 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { av_dict_set_int(&options, "listen", 1, 0); } - int result = 0; + interrupted_ = false; + + // ffmpeg avformat_open_input call can hang if media source doesn't respond + // set a guard to handle such situations, if requested + std::promise<bool> p; + std::future<bool> f = p.get_future(); + std::unique_ptr<std::thread> guard; + if (params_.preventStaleness) { + guard = std::make_unique<std::thread>([&f, this]() { + auto timeout = std::chrono::milliseconds(params_.timeoutMs); + if (std::future_status::timeout == f.wait_for(timeout)) { + LOG(ERROR) << "Cannot open stream within " << params_.timeoutMs + << " ms"; + interrupted_ = true; + } + }); + } + if (fmt) { - result = avformat_open_input(&tmpCtx, nullptr, fmt, &options); + result = avformat_open_input(&inputCtx_, nullptr, fmt, &options); } else { result = - avformat_open_input(&tmpCtx, params_.uri.c_str(), nullptr, &options); + avformat_open_input(&inputCtx_, params_.uri.c_str(), nullptr, &options); } - av_dict_free(&options); - p.set_value(true); - guard.join(); + av_dict_free(&options); - inputCtx_ = tmpCtx; + if (guard) { + p.set_value(true); + guard->join(); + guard.reset(); + } if (result < 0 || interrupted_) { LOG(ERROR) << "avformat_open_input failed, error: " @@ -356,7 +351,7 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { return false; } - if (!activateStreams()) { + if (!openStreams()) { LOG(ERROR) << "Cannot activate streams"; cleanUp(); return false; @@ -364,20 +359,19 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { onInit(); - if (params.startOffsetMs != 0) { - av_seek_frame( - inputCtx_, - -1, - params.startOffsetMs * AV_TIME_BASE / 1000, - AVSEEK_FLAG_FRAME | AVSEEK_FLAG_ANY); + if (params.startOffset != 0) { + auto offset = params.startOffset <= params.seekAccuracy + ? 
0 : params.startOffset - params.seekAccuracy; + + av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD); } - LOG(INFO) << "Decoder initialized, log level: " << params_.logLevel; - outOfRange_ = false; + VLOG(1) << "Decoder initialized, log level: " << params_.logLevel; return true; } -bool Decoder::activateStreams() { +bool Decoder::openStreams() { for (int i = 0; i < inputCtx_->nb_streams; i++) { // - find the corresponding format at params_.formats set MediaFormat format; @@ -418,6 +412,7 @@ return false; } streams_.emplace(i, std::move(stream)); + inRange_.set(i, true); } } @@ -458,8 +453,8 @@ void Decoder::cleanUp() { seekableBuffer_.shutdown(); } -int Decoder::getBytes(size_t workingTimeInMs) { - if (outOfRange_) { +int Decoder::getFrame(size_t workingTimeInMs) { + if (inRange_.none()) { return ENODATA; } // decode frames until cache is full and leave thread @@ -478,14 +473,16 @@ return std::chrono::steady_clock::now() <= end; }; - int result = ETIMEDOUT; + int result = 0; size_t decodingErrors = 0; - while (!interrupted_ && watcher()) { + bool decodedFrame = false; + while (!interrupted_ && inRange_.any() && !decodedFrame && watcher()) { result = av_read_frame(inputCtx_, &avPacket); if (result == AVERROR(EAGAIN)) { VLOG(4) << "Decoder is busy..."; + std::this_thread::yield(); result = 0; // reset error, EAGAIN is not an error at all - break; + continue; } else if (result == AVERROR_EOF) { flushStreams(); VLOG(1) << "End of stream"; @@ -499,24 +496,24 @@ // get stream auto stream = findByIndex(avPacket.stream_index); - if (stream == nullptr) { + if (stream == nullptr || !inRange_.test(stream->getIndex())) { av_packet_unref(&avPacket); continue; } - stream->rescalePackage(&avPacket); - - AVPacket copyPacket = avPacket; - size_t numConsecutiveNoBytes = 0; // it can be only partial decoding of the package bytes do { // decode package - if ((result = processPacket(stream, &copyPacket)) < 0) { + bool gotFrame = false; + bool hasMsg = false; + // packet either got consumed completely or not at all + if ((result = processPacket(stream, &avPacket, &gotFrame, &hasMsg)) < 0) { + LOG(ERROR) << "processPacket failed with code: " << result; break; } - if (result == 0 && params_.maxProcessNoBytes != 0 && + if (!gotFrame && params_.maxProcessNoBytes != 0 && ++numConsecutiveNoBytes > params_.maxProcessNoBytes) { LOG(ERROR) << "Exceeding max amount of consecutive no bytes"; break; @@ ... numConsecutiveNoBytes = 0; } - copyPacket.size -= result; - copyPacket.data += result; - } while (copyPacket.size > 0); + decodedFrame |= hasMsg; + } while (result == 0); // post loop check if (result < 0) { if (params_.maxPackageErrors != 0 && // check errors ++decodingErrors >= params_.maxPackageErrors) { // reached the limit + LOG(ERROR) << "Exceeding max amount of consecutive package errors"; break; } } else { @@ -546,7 +543,27 @@ av_packet_unref(&avPacket); - return result; + VLOG(2) << "Interrupted loop" + << ", interrupted_ " << interrupted_ << ", inRange_.any() " + << inRange_.any() << ", decodedFrame " << decodedFrame << ", result " + << result; + + // loop can be terminated, either by: + // 1. explicitly interrupted + // 2. terminated by workable timeout + // 3. unrecoverable error or ENODATA (end of stream) + // 4. 
decoded frames' pts are out of the specified range + // 5. successfully decoded frame + if (interrupted_) { + return EINTR; + } + if (result != 0) { + return result; + } + if (inRange_.none()) { + return ENODATA; + } + return 0; } Stream* Decoder::findByIndex(int streamIndex) const { @@ -563,17 +580,23 @@ Stream* Decoder::findByType(const MediaFormat& format) const { return nullptr; } -int Decoder::processPacket(Stream* stream, AVPacket* packet) { +int Decoder::processPacket(Stream* stream, + AVPacket* packet, + bool* gotFrame, + bool* hasMsg) { // decode package - int gotFrame = 0; int result; DecoderOutputMessage msg; msg.payload = createByteStorage(0); - if ((result = stream->decodeFrame(packet, &gotFrame)) >= 0 && gotFrame && - stream->getFrameBytes(&msg, params_.headerOnly) > 0) { + *hasMsg = false; + if ((result = stream->decodePacket( + packet, &msg, params_.headerOnly, gotFrame)) >= 0 && *gotFrame) { // check end offset - if (params_.endOffsetMs <= 0 || - !(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) { + bool endInRange = + params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; + inRange_.set(stream->getIndex(), endInRange); + if (endInRange && msg.header.pts >= params_.startOffset) { + *hasMsg = true; push(std::move(msg)); } } @@ -587,9 +610,13 @@ void Decoder::flushStreams() { while (msg.payload = createByteStorage(0), stream.second->flush(&msg, params_.headerOnly) > 0) { // check end offset - if (params_.endOffsetMs <= 0 || - !(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) { + bool endInRange = + params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; + inRange_.set(stream.second->getIndex(), endInRange); + if (endInRange && msg.header.pts >= params_.startOffset) { push(std::move(msg)); + } else { + msg.payload.reset(); } } } diff --git a/torchvision/csrc/cpu/decoder/decoder.h b/torchvision/csrc/cpu/decoder/decoder.h index 971eec10aa4..11894fabb74 100644 --- a/torchvision/csrc/cpu/decoder/decoder.h +++ b/torchvision/csrc/cpu/decoder/decoder.h @@ -1,7 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once +#include <bitset> +#include <unordered_map> #include "seekable_buffer.h" #include "stream.h" @@ -15,7 +15,6 @@ namespace ffmpeg { class Decoder : public MediaDecoder { public: Decoder(); - ~Decoder() override; // MediaDecoder overrides bool init(const DecoderParameters& params, DecoderInCallback&& in) override; @@ -25,9 +24,10 @@ protected: // function does actual work, derived class calls it in working thread - // periodically. On success method returns 0, ENOADATA on EOF and error on + // periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if + // no frames got decoded in the specified timeout time, and error on // unrecoverable error. 
- int getBytes(size_t workingTimeInMs = 100); + int getFrame(size_t workingTimeInMs = 100); // Derived class must override method and consume the provided message virtual void push(DecoderOutputMessage&& buffer) = 0; @@ -56,13 +56,15 @@ virtual int64_t seekCallback(int64_t offset, int whence); virtual int shutdownCallback(); - bool activateStreams(); + bool openStreams(); Stream* findByIndex(int streamIndex) const; Stream* findByType(const MediaFormat& format) const; - int processPacket(Stream* stream, AVPacket* packet); + int processPacket(Stream* stream, + AVPacket* packet, + bool* gotFrame, + bool* hasMsg); void flushStreams(); void cleanUp(); - private: DecoderParameters params_; SeekableBuffer seekableBuffer_; @@ -72,6 +74,6 @@ AVFormatContext* inputCtx_{nullptr}; AVIOContext* avioCtx_{nullptr}; std::unordered_map<int, std::unique_ptr<Stream>> streams_; - bool outOfRange_{false}; + std::bitset<64> inRange_; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/defs.h b/torchvision/csrc/cpu/decoder/defs.h index 62854668b90..2e282bb59c6 100644 --- a/torchvision/csrc/cpu/decoder/defs.h +++ b/torchvision/csrc/cpu/decoder/defs.h @@ -27,7 +27,7 @@ struct AudioFormat { size_t samples{0}; // number of samples per second (frequency) size_t channels{0}; // number of channels - ssize_t format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE + long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE size_t padding[2]; // -- alignment 40 bytes }; @@ -42,7 +42,7 @@ struct VideoFormat { size_t width{0}; // width in pixels size_t height{0}; // height in pixels - ssize_t format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE + long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE size_t minDimension{0}; // choose min dimension and rescale accordingly size_t cropImage{0}; // request image crop // -- alignment 40 bytes }; // subtitle/cc struct SubtitleFormat { - ssize_t type{0}; // AVSubtitleType, auto SUBTITLE_NONE + long type{0}; // AVSubtitleType, auto SUBTITLE_NONE size_t padding[4]; // -- alignment 40 bytes }; @@ -94,28 +94,27 @@ struct MediaFormat { } } - explicit MediaFormat(ssize_t s = -1) - : type(TYPE_AUDIO), stream(s), format() {} - explicit MediaFormat(int x, ssize_t s = -1) + explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {} + explicit MediaFormat(int x, long s = -1) : type(TYPE_VIDEO), stream(s), format(x) {} - explicit MediaFormat(char x, ssize_t s = -1) + explicit MediaFormat(char x, long s = -1) : type(TYPE_SUBTITLE), stream(s), format(x) {} - explicit MediaFormat(double x, ssize_t s = -1) + explicit MediaFormat(double x, long s = -1) : type(TYPE_CC), stream(s), format(x) {} - static MediaFormat makeMediaFormat(AudioFormat format, ssize_t stream) { + static MediaFormat makeMediaFormat(AudioFormat format, long stream) { MediaFormat result(stream); result.format.audio = format; return result; } - static MediaFormat makeMediaFormat(VideoFormat format, ssize_t stream) { + static MediaFormat makeMediaFormat(VideoFormat format, long stream) { MediaFormat result(0, stream); result.format.video = format; return result; } - static MediaFormat makeMediaFormat(SubtitleFormat format, ssize_t stream) { + static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) { MediaFormat result('0', stream); result.format.subtitle = format; return result; } @@ -126,17 +125,17 @@ // stream index: // set -1 for one stream auto detection, -2 for all streams auto 
detection, // >= 0, specified stream, if caller knows the stream index (unlikely) - ssize_t stream; + long stream; // union keeps one of the possible formats, defined by MediaType FormatUnion format; // output parameters, ignored while initialization // time base numerator - ssize_t num{0}; + long num{0}; // time base denominator - ssize_t den{1}; - // duration of the stream, in stream time base, if available - ssize_t duration{-1}; + long den{1}; + // duration of the stream, in microseconds, if available + long duration{-1}; }; struct DecoderParameters { @@ -146,29 +145,33 @@ // timeout on getting bytes for decoding size_t timeoutMs{1000}; // logging level, default AV_LOG_PANIC - ssize_t logLevel{0}; + long logLevel{0}; // when decoder would give up, 0 means never size_t maxPackageErrors{0}; // max allowed consecutive times no bytes are processed. 0 means infinite. size_t maxProcessNoBytes{0}; - // start offset - ssize_t startOffsetMs{0}; - // end offset - ssize_t endOffsetMs{-1}; + // start offset (us) + long startOffset{0}; + // end offset (us) + long endOffset{-1}; // logging id int64_t loggingUuid{0}; + // internal max seekable buffer size + size_t maxSeekableBytes{0}; // adjust header pts to the epoch time bool convertPtsToWallTime{false}; // indicate if input stream is an encoded image bool isImage{false}; - // what media types should be processed, default none - std::set<MediaFormat> formats; // listen and wait for new rtmp stream bool listen{false}; // don't copy frame body, only header bool headerOnly{false}; - // seek tolerated accuracy - double seekAccuracySec{1.0}; + // interrupt init method on timeout + bool preventStaleness{true}; + // seek tolerated accuracy (us) + double seekAccuracy{1000000.0}; + // what media types should be processed, default none + std::set<MediaFormat> formats; }; struct DecoderHeader { @@ sequence number of each frame size_t seqno{0}; // decoded timestamp in microseconds from either beginning of the stream or // from epoch time, see DecoderParameters::convertPtsToWallTime - ssize_t pts{0}; + long pts{0}; // decoded key frame size_t keyFrame{0}; // frames per second, valid only for video streams @@ -219,27 +222,21 @@ struct DecoderOutputMessage { * Normally input/output parameter @out is set to a valid, not null buffer pointer, * which indicates "read" call, however there are "seek" modes as well. - * @out != nullptr, @size != 0, @timeoutMs != 0 => read from the current offset - * @size bytes => return number bytes read, 0 if no more bytes available, < 0 - * on error. - * @out == nullptr, @size == 0, @timeoutMs == 0 => does provider support "seek" - * capability in a first place? return 0 on success, < 0 if "seek" mode is not - * supported. - * @out == nullptr, @size > 0 => seek the absolute offset == @size, return - * 0 on success and < 0 on error. + * @out != nullptr => read from the current offset, @whence is ignored, + * @size bytes to read => return number of bytes read, 0 if no more bytes + * available, < 0 on error. - * @out == nullptr, @size < 0 => seek the end of the media, return 0 on success - * and < 0 on failure. Provider might support seek doesn't know the media size. + * @out == nullptr, @timeoutMs == 0 => does provider support "seek" + * capability in the first place? @size & @whence are ignored, return 0 on + * success, < 0 if "seek" mode is not supported. - * Additionally if @out is set to null AND @size is set to zero AND - * @timeoutMs is set to zero, caller requests the seek capability of the - * provider, i.e. 
returns 0 on success and error if provider is not supporting - * seek. + * @out == nullptr, @timeoutMs != 0 => normal seek call + * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE] + * return < 0 on error, position if @whence = [SEEK_SET, SEEK_CUR, SEEK_END], + * length of buffer if @whence = [AVSEEK_SIZE]. */ using DecoderInCallback = - std::function<int(uint8_t*, int, uint64_t)>; + std::function<int(uint8_t*, int, int, uint64_t)>; using DecoderOutCallback = std::function<void(DecoderOutputMessage&&)>; diff --git a/torchvision/csrc/cpu/decoder/memory_buffer.cpp b/torchvision/csrc/cpu/decoder/memory_buffer.cpp new file mode 100644 index 00000000000..d91213fdcbb --- /dev/null +++ b/torchvision/csrc/cpu/decoder/memory_buffer.cpp @@ -0,0 +1,75 @@ +#include "memory_buffer.h" +#include + +extern "C" { +#include +} + +namespace ffmpeg { + +MemoryBuffer::MemoryBuffer(const uint8_t* buffer, size_t size) + : buffer_(buffer), len_(size) {} + +int MemoryBuffer::read(uint8_t* buf, int size) { + if (pos_ < len_) { + auto available = std::min(int(len_ - pos_), size); + memcpy(buf, buffer_ + pos_, available); + pos_ += available; + return available; + } + + return 0; +} + +int64_t MemoryBuffer::seek(int64_t offset, int whence) { + if (whence & AVSEEK_SIZE) { + return len_; + } + + // remove force flag + whence &= ~AVSEEK_FORCE; + + switch (whence) { + case SEEK_SET: + if (offset >= 0 && offset <= len_) { + pos_ = offset; + } + break; + case SEEK_END: + if (len_ + offset >= 0 && len_ + offset <= len_) { + pos_ = len_ + offset; + } + break; + case SEEK_CUR: + if (pos_ + offset > 0 && pos_ + offset <= len_) { + pos_ += offset; + } + break; + default: + LOG(ERROR) << "Unknown whence flag provided: " << whence; + } + return pos_; } + +/* static */ +DecoderInCallback MemoryBuffer::getCallback( + const uint8_t* buffer, + size_t size) { + MemoryBuffer object(buffer, size); + return + [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable + -> int { + if (out) { // see defs.h file + // read mode + return object.read(out, size); + } + // seek mode + if (!timeoutMs) { + // seek capability, yes - supported + return 0; + } + return object.seek(size, whence); + }; } + +} // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/memory_buffer.h b/torchvision/csrc/cpu/decoder/memory_buffer.h new file mode 100644 index 00000000000..909626d3cae --- /dev/null +++ b/torchvision/csrc/cpu/decoder/memory_buffer.h @@ -0,0 +1,25 @@ +#pragma once + +#include "defs.h" + +namespace ffmpeg { + +/** + * Class uses external memory buffer and implements a seekable interface. + */ +class MemoryBuffer { + public: + explicit MemoryBuffer(const uint8_t* buffer, size_t size); + int64_t seek(int64_t offset, int whence); + int read(uint8_t* buf, int size); + + // static constructor for decoder callback. + static DecoderInCallback getCallback(const uint8_t* buffer, size_t size); + + private: + const uint8_t* buffer_; // set at construction time + long pos_{0}; // current position + long len_{0}; // bytes in buffer +}; + +} // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/seekable_buffer.cpp b/torchvision/csrc/cpu/decoder/seekable_buffer.cpp index 8d159b789bf..2e6732a2f50 100644 --- a/torchvision/csrc/cpu/decoder/seekable_buffer.cpp +++ b/torchvision/csrc/cpu/decoder/seekable_buffer.cpp @@ -1,8 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "seekable_buffer.h" #include #include +#include "memory_buffer.h" extern "C" { #include } namespace ffmpeg { -bool SeekableBuffer::init( +int SeekableBuffer::init( DecoderInCallback&& in, - ssize_t minSize, - ssize_t maxSize, - uint64_t timeoutMs) { + uint64_t timeoutMs, + size_t maxSeekableBytes, + ImageType* type) { + shutdown(); + isSeekable_ = in(nullptr, 0, 0, 0) == 0; + if (isSeekable_) { // seekable + if (type) { + if (!readBytes(in, 8, timeoutMs)) { + return -1; + } + setImageType(type); + end_ = 0; + eof_ = false; + std::vector<uint8_t>().swap(buffer_); + // reset callback + if (in(nullptr, 0, SEEK_SET, timeoutMs)) { + return -1; + } + } + inCallback_ = std::forward<DecoderInCallback>(in); + return 1; + } + + if (!readBytes(in, maxSeekableBytes, timeoutMs)) { + return -1; + } + + if (type) { + setImageType(type); + } + + if (eof_) { + end_ = 0; + eof_ = false; + // reuse MemoryBuffer functionality + inCallback_ = MemoryBuffer::getCallback(buffer_.data(), buffer_.size()); + isSeekable_ = true; + return 1; + } inCallback_ = std::forward<DecoderInCallback>(in); - len_ = minSize; - buffer_.resize(len_); - pos_ = 0; + return 0; +} + +bool SeekableBuffer::readBytes( + DecoderInCallback& in, + size_t maxBytes, + uint64_t timeoutMs) { + // Resize to the minimum 4K page or less + buffer_.resize(std::min(maxBytes, 4 * 1024UL)); end_ = 0; - eof_ = 0; + eof_ = false; auto end = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeoutMs); @@ -28,62 +69,58 @@ return std::chrono::steady_clock::now() <= end; }; - bool hasTime = false; - while (!eof_ && end_ < maxSize && (hasTime = watcher())) { + bool hasTime = true; + while (!eof_ && end_ < maxBytes && (hasTime = watcher())) { // let's read all bytes into available buffer - auto res = inCallback_(buffer_.data() + end_, len_ - end_, timeoutMs); + auto res = in(buffer_.data() + end_, buffer_.size() - end_, 0, timeoutMs); if (res > 0) { end_ += res; - if (end_ == len_) { - len_ = std::min(len_ * 4, maxSize); - buffer_.resize(len_); + if (end_ == buffer_.size()) { + buffer_.resize(std::min(end_ * 4UL, maxBytes)); } } else if (res == 0) { - eof_ = 1; + eof_ = true; } else { // error return false; } } - if (!hasTime) { - return false; - } + return hasTime; +} +void SeekableBuffer::setImageType(ImageType* type) { if (buffer_.size() > 2 && buffer_[0] == 0xFF && buffer_[1] == 0xD8 && buffer_[2] == 0xFF) { - imageType_ = ImageType::JPEG; + *type = ImageType::JPEG; } else if ( buffer_.size() > 3 && buffer_[1] == 'P' && buffer_[2] == 'N' && buffer_[3] == 'G') { - imageType_ = ImageType::PNG; + *type = ImageType::PNG; } else if ( buffer_.size() > 1 && ((buffer_[0] == 0x49 && buffer_[1] == 0x49) || (buffer_[0] == 0x4D && buffer_[1] == 0x4D))) { - imageType_ = ImageType::TIFF; + *type = ImageType::TIFF; + } else { + *type = ImageType::UNKNOWN; } - - return true; } int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) { - // 1. pos_ < end_ + if (isSeekable_) { + return inCallback_(buf, size, 0, timeoutMs); + } if (pos_ < end_) { + // read cached bytes for non-seekable callback auto available = std::min(int(end_ - pos_), size); memcpy(buf, buffer_.data() + pos_, available); pos_ += available; return available; } else if (!eof_) { - auto res = inCallback_(buf, size, timeoutMs); // read through - if (res > 0) { - pos_ += res; - if (pos_ > end_ && !buffer_.empty()) { - std::vector<uint8_t>().swap(buffer_); - } - } else if (res == 0) { - eof_ = 1; - } + // normal sequential read (see defs.h file), i.e. 
@buf != null + auto res = inCallback_(buf, size, 0, timeoutMs); // read through + eof_ = res == 0; return res; } else { return 0; } } int64_t SeekableBuffer::seek(int64_t offset, int whence, uint64_t timeoutMs) { - // remove force flag - whence &= ~AVSEEK_FORCE; - // get size request - int size = whence & AVSEEK_SIZE; - // remove size flag - whence &= ~AVSEEK_SIZE; - - if (size) { - return eof_ ? end_ : AVERROR(EINVAL); - } else { - switch (whence) { - case SEEK_SET: - if (offset < 0) { - return AVERROR(EINVAL); - } - if (offset <= end_) { - pos_ = offset; - return pos_; - } - if (!inCallback_(0, offset, timeoutMs)) { - pos_ = offset; - return 0; - } - break; - case SEEK_END: - if (eof_ && pos_ <= end_ && offset < 0 && end_ + offset >= 0) { - pos_ = end_ + offset; - return 0; - } - break; - case SEEK_CUR: - if (pos_ + offset < 0) { - return AVERROR(EINVAL); - } - if (pos_ + offset <= end_) { - pos_ += offset; - return 0; - } - if (!inCallback_(0, pos_ + offset, timeoutMs)) { - pos_ += offset; - return 0; - } - break; - default: - LOG(ERROR) << "Unknown whence flag gets provided: " << whence; - } - } - return AVERROR(EINVAL); // we have no idea what the media size is + return inCallback_(nullptr, offset, whence, timeoutMs); } void SeekableBuffer::shutdown() { + pos_ = end_ = 0; + eof_ = false; + std::vector<uint8_t>().swap(buffer_); inCallback_ = nullptr; } diff --git a/torchvision/csrc/cpu/decoder/seekable_buffer.h b/torchvision/csrc/cpu/decoder/seekable_buffer.h index e8ba327e4ea..9d5729f5306 100644 --- a/torchvision/csrc/cpu/decoder/seekable_buffer.h +++ b/torchvision/csrc/cpu/decoder/seekable_buffer.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" @@ -20,27 +18,28 @@ enum class ImageType { class SeekableBuffer { public: - // try to fill out buffer, returns true if EOF detected (seek will supported) - bool init( + // @type is optional, not nullptr only if image detection is required + // \returns 1 if buffer is seekable, 0 if not seekable, < 0 on error + int init( DecoderInCallback&& in, - ssize_t minSize, - ssize_t maxSize, - uint64_t timeoutMs); + uint64_t timeoutMs, + size_t maxSeekableBytes, + ImageType* type); int read(uint8_t* buf, int size, uint64_t timeoutMs); int64_t seek(int64_t offset, int whence, uint64_t timeoutMs); void shutdown(); - ImageType getImageType() const { - return imageType_; - } + + private: + bool readBytes(DecoderInCallback& in, size_t maxBytes, uint64_t timeoutMs); + void setImageType(ImageType* type); private: DecoderInCallback inCallback_; std::vector<uint8_t> buffer_; // resized at init time - ssize_t len_{0}; // current buffer size - ssize_t pos_{0}; // current position (SEEK_CUR iff pos_ < end_) - ssize_t end_{0}; // bytes in buffer [0, buffer_.size()] - ssize_t eof_{0}; // indicates the EOF - ImageType imageType_{ImageType::UNKNOWN}; + long pos_{0}; // current position (SEEK_CUR iff pos_ < end_) + long end_{0}; // current buffer size + bool eof_{0}; // indicates the EOF + bool isSeekable_{false}; // is callback seekable }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/stream.cpp b/torchvision/csrc/cpu/decoder/stream.cpp index 767136657b6..ce13ca05a83 100644 --- a/torchvision/csrc/cpu/decoder/stream.cpp +++ b/torchvision/csrc/cpu/decoder/stream.cpp @@ -1,22 +1,18 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "stream.h" #include #include "util.h" namespace ffmpeg { -namespace { -const size_t kDecoderHeaderSize = sizeof(DecoderHeader); -} - Stream::Stream( AVFormatContext* inputCtx, MediaFormat format, - bool convertPtsToWallTime) + bool convertPtsToWallTime, + int64_t loggingUuid) : inputCtx_(inputCtx), format_(format), - convertPtsToWallTime_(convertPtsToWallTime) {} + convertPtsToWallTime_(convertPtsToWallTime), + loggingUuid_(loggingUuid) {} Stream::~Stream() { if (frame_) { @@ -36,25 +32,30 @@ int Stream::openCodec() { auto codec_id = steam->codecpar->codec_id; AVCodec* codec = avcodec_find_decoder(codec_id); if (!codec) { - LOG(ERROR) << "avcodec_find_decoder failed for codec_id: " << int(codec_id); + LOG(ERROR) << "LoggingUuid #" << loggingUuid_ + << ", avcodec_find_decoder failed for codec_id: " + << int(codec_id); return AVERROR(EINVAL); } if (!(codecCtx_ = avcodec_alloc_context3(codec))) { - LOG(ERROR) << "avcodec_alloc_context3 fails"; + LOG(ERROR) << "LoggingUuid #" << loggingUuid_ + << ", avcodec_alloc_context3 failed"; return AVERROR(ENOMEM); } int ret; // Copy codec parameters from input stream to output codec context if ((ret = avcodec_parameters_to_context(codecCtx_, steam->codecpar)) < 0) { - LOG(ERROR) << "Failed to copy codec parameters to decoder context"; + LOG(ERROR) << "LoggingUuid #" << loggingUuid_ + << ", avcodec_parameters_to_context failed"; return ret; } // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful if ((ret = avcodec_open2(codecCtx_, codec, nullptr)) < 0) { - LOG(ERROR) << "avcodec_open2 failed. " << Util::generateErrorDesc(ret); + LOG(ERROR) << "LoggingUuid #" << loggingUuid_ + << ", avcodec_open2 failed: " << Util::generateErrorDesc(ret); avcodec_free_context(&codecCtx_); codecCtx_ = nullptr; return ret; @@ -62,30 +63,41 @@ int Stream::openCodec() { frame_ = av_frame_alloc(); + // always convert to us format_.num = inputCtx_->streams[format_.stream]->time_base.num; format_.den = inputCtx_->streams[format_.stream]->time_base.den; - format_.duration = inputCtx_->streams[format_.stream]->duration; - return initFormat(); -} + switch (format_.type) { + case TYPE_VIDEO: + fps_ = av_q2d(av_guess_frame_rate( + inputCtx_, inputCtx_->streams[format_.stream], nullptr)); + break; + case TYPE_AUDIO: + fps_ = codecCtx_->sample_rate; + break; + default: + fps_ = 30.0; + } + + format_.duration = av_rescale_q( + inputCtx_->streams[format_.stream]->duration, + inputCtx_->streams[format_.stream]->time_base, + AV_TIME_BASE_Q); -// rescale package -void Stream::rescalePackage(AVPacket* packet) { - if (codecCtx_->time_base.num != 0) { - av_packet_rescale_ts( - packet, - inputCtx_->streams[format_.stream]->time_base, - codecCtx_->time_base); + if ((ret = initFormat())) { + LOG(ERROR) << "initFormat failed, type: " << format_.type; } + + return ret; } -int Stream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { +int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) { int consumed = 0; int result = avcodec_send_packet(codecCtx_, packet); if (result == AVERROR(EAGAIN)) { - *gotFramePtr = 0; // no bytes get consumed, fetch frame + *gotFrame = false; // no bytes get consumed, fetch frame } else if (result == AVERROR_EOF) { - *gotFramePtr = 0; // more than one flush packet + *gotFrame = false; // more than one flush packet if (packet) { // got packet after flush, this is an error return result; @@ -95,23 +107,23 @@ int Stream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { << Util::generateErrorDesc(result); return result; 
// error } else { - consumed = packet ? packet->size : 0; // all bytes get consumed + consumed = 1; // all bytes get consumed } result = avcodec_receive_frame(codecCtx_, frame_); if (result >= 0) { - *gotFramePtr = 1; // frame is available + *gotFrame = true; // frame is available } else if (result == AVERROR(EAGAIN)) { - *gotFramePtr = 0; // no frames at this time, needs more packets + *gotFrame = false; // no frames at this time, needs more packets if (!consumed) { // precaution, if no packages got consumed and no frames are available return result; } } else if (result == AVERROR_EOF) { - *gotFramePtr = 0; // the last frame has been flushed + *gotFrame = false; // the last frame has been flushed // precaution, if no more frames are available assume we consume all bytes - consumed = packet ? packet->size : 0; + consumed = 0; } else { // error LOG(ERROR) << "avcodec_receive_frame failed, err: " << Util::generateErrorDesc(result); @@ ... return consumed; } -int Stream::decodeFrame(const AVPacket* packet, int* gotFramePtr) { - return analyzePacket(packet, gotFramePtr); -} - -int Stream::getFrameBytes(DecoderOutputMessage* out, bool headerOnly) { - return fillBuffer(out, false, headerOnly); +int Stream::decodePacket( + const AVPacket* packet, + DecoderOutputMessage* out, + bool headerOnly, + bool* hasMsg) { + int consumed; + bool gotFrame = false; + *hasMsg = false; + if ((consumed = analyzePacket(packet, &gotFrame)) >= 0 && + (packet == nullptr || gotFrame)) { + int result; + if ((result = getMessage(out, !gotFrame, headerOnly)) < 0) { + return result; // report error + } + *hasMsg = result > 0; + } + return consumed; } int Stream::flush(DecoderOutputMessage* out, bool headerOnly) { - int gotFramePtr = 0; - int result; - if (analyzePacket(nullptr, &gotFramePtr) >= 0 && gotFramePtr && - (result = fillBuffer(out, false, headerOnly)) > 0) { - return result; - } else if ((result = fillBuffer(out, true, headerOnly)) > 0) { + bool hasMsg = false; + int result = decodePacket(nullptr, out, headerOnly, &hasMsg); + if (result < 0) { + avcodec_flush_buffers(codecCtx_); return result; } - return result; + if (!hasMsg) { + avcodec_flush_buffers(codecCtx_); + return 0; + } + return 1; } -int Stream::fillBuffer(DecoderOutputMessage* out, bool flush, bool headerOnly) { - int result = -1; - if (!codecCtx_) { - LOG(INFO) << "Codec is not initialized"; - return result; +int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) { + if (flush) { + // only flush of audio frames makes sense + if (format_.type == TYPE_AUDIO) { + int bytes = 0; + if ((bytes = estimateBytes(true)) < 0) { + return bytes; + } + int processed = 0; + // grab all audio bytes by chunks + do { + out->payload->ensure(out->payload->length() + bytes); + if ((processed = copyFrameBytes(out->payload.get(), true)) < 0) { + return processed; + } + } while (processed); + + if (out->payload->length()) { + // set header first + setHeader(&out->header, flush); + return 1; + } + } + return 0; + } else { + // set header first + setHeader(&out->header, flush); + + if (headerOnly) { + // Only header is requested + return 1; + } + + // decoded frame is available + int bytes; + if ((bytes = estimateBytes(false)) < 0) { + return bytes; + } + out->payload->ensure(bytes); + return copyFrameBytes(out->payload.get(), false); } +} + +void Stream::setHeader(DecoderHeader* header, bool flush) { + header->seqno = numGenerator_++; - // assign message - 
setHeader(&out->header); + setFramePts(header, flush); - if (headerOnly) { - return sizeof(out->header); + if (convertPtsToWallTime_) { + keeper_.adjust(header->pts); } - // init sampler, if any and return required bytes - if ((result = estimateBytes(flush)) < 0) { - return result; + header->format = format_; + header->keyFrame = 0; + header->fps = std::numeric_limits<double>::quiet_NaN(); +} + +void Stream::setFramePts(DecoderHeader* header, bool flush) { + if (flush) { + header->pts = nextPts_; // already in us + } else { + header->pts = av_frame_get_best_effort_timestamp(frame_); + if (header->pts == AV_NOPTS_VALUE) { + header->pts = nextPts_; + } else { + header->pts = av_rescale_q( + header->pts, + inputCtx_->streams[format_.stream]->time_base, + AV_TIME_BASE_Q); + } + + switch (format_.type) { + case TYPE_AUDIO: + nextPts_ = header->pts + frame_->nb_samples * AV_TIME_BASE / fps_; + break; + case TYPE_VIDEO: + nextPts_ = header->pts + AV_TIME_BASE / fps_; + break; + default: + nextPts_ = header->pts; + } } - out->payload->ensure(result); - return copyFrameBytes(out->payload.get(), flush); } } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/stream.h b/torchvision/csrc/cpu/decoder/stream.h index fd83b90428c..3473a2a0fd3 100644 --- a/torchvision/csrc/cpu/decoder/stream.h +++ b/torchvision/csrc/cpu/decoder/stream.h @@ -1,9 +1,8 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include <atomic> #include "defs.h" +#include "time_keeper.h" extern "C" { #include } @@ -22,23 +21,24 @@ class Stream { public: Stream( AVFormatContext* inputCtx, MediaFormat format, - bool convertPtsToWallTime); + bool convertPtsToWallTime, + int64_t loggingUuid); virtual ~Stream(); // returns 0 - on success or negative error int openCodec(); - // returns number processed bytes from packet, or negative error - int decodeFrame(const AVPacket* packet, int* gotFramePtr); + // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error + int decodePacket( + const AVPacket* packet, + DecoderOutputMessage* out, + bool headerOnly, + bool* hasMsg); // returns stream index int getIndex() const { return format_.stream; } - // returns number decoded/sampled bytes - int getFrameBytes(DecoderOutputMessage* out, bool headerOnly); - // returns number decoded/sampled bytes + // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error int flush(DecoderOutputMessage* out, bool headerOnly); - // rescale package - void rescalePackage(AVPacket* packet); // return media format MediaFormat getMediaFormat() const { return format_; } protected: virtual int initFormat() = 0; + // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error + virtual int analyzePacket(const AVPacket* packet, bool* gotFrame); // returns number processed bytes from packet, or negative error - virtual int analyzePacket(const AVPacket* packet, int* gotFramePtr); - // returns number decoded/sampled bytes, or negative error virtual int copyFrameBytes(ByteStorage* out, bool flush) = 0; - // initialize codec, returns output buffer size, or negative error + // estimates bytes in frame, returns output buffer size, or negative error virtual int estimateBytes(bool flush) = 0; // sets output format - virtual void setHeader(DecoderHeader* header) = 0; + virtual void setHeader(DecoderHeader* header, bool flush); + // set frame pts + virtual void setFramePts(DecoderHeader* header, bool flush); // finds codec virtual AVCodec* findCodec(AVCodecContext* ctx); private: - int 
fillBuffer(DecoderOutputMessage* out, bool flush, bool headerOnly); + // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error + int getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly); protected: AVFormatContext* const inputCtx_; MediaFormat format_; const bool convertPtsToWallTime_; + int64_t loggingUuid_; AVCodecContext* codecCtx_{nullptr}; AVFrame* frame_{nullptr}; std::atomic numGenerator_{0}; + TimeKeeper keeper_; + // estimated next frame pts for flushing the last frame + int64_t nextPts_{0}; + double fps_{30.}; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp b/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp index 02859c19187..b89ef8f1b86 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp +++ b/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "subtitle_sampler.h" #include "util.h" diff --git a/torchvision/csrc/cpu/decoder/subtitle_sampler.h b/torchvision/csrc/cpu/decoder/subtitle_sampler.h index 4846fe4d7c5..298e48d591f 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_sampler.h +++ b/torchvision/csrc/cpu/decoder/subtitle_sampler.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/subtitle_stream.cpp b/torchvision/csrc/cpu/decoder/subtitle_stream.cpp index b699a0507cf..4f83fad68f8 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_stream.cpp +++ b/torchvision/csrc/cpu/decoder/subtitle_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "subtitle_stream.h" #include #include @@ -26,7 +24,8 @@ SubtitleStream::SubtitleStream( : Stream( inputCtx, MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime) { + convertPtsToWallTime, + 0) { memset(&sub_, 0, sizeof(sub_)); } @@ -51,16 +50,16 @@ int SubtitleStream::initFormat() { return 0; } -int SubtitleStream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { +int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) { // clean-up releaseSubtitle(); // check flush packet AVPacket avPacket; av_init_packet(&avPacket); avPacket.data = nullptr; - auto pkt = packet ? *packet : avPacket; - int result = avcodec_decode_subtitle2(codecCtx_, &sub_, gotFramePtr, &pkt); + int gotFramePtr = 0; + int result = avcodec_decode_subtitle2(codecCtx_, &sub_, &gotFramePtr, &pkt); if (result < 0) { VLOG(1) << "avcodec_decode_subtitle2 failed, err: " @@ -69,17 +68,18 @@ int SubtitleStream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { result = packet ? packet->size : 0; // discard the rest of the package } - sub_.release = *gotFramePtr; + sub_.release = gotFramePtr; + *gotFrame = gotFramePtr > 0; return result; } -int SubtitleStream::estimateBytes(bool flush) { +int SubtitleStream::estimateBytes(bool) { if (!(sampler_.getInputFormat().subtitle == *codecCtx_)) { // - reinit sampler SamplerParameters params; params.type = MediaType::TYPE_SUBTITLE; toSubtitleFormat(params.in.subtitle, *codecCtx_); - if (flush || !sampler_.init(params)) { + if (!sampler_.init(params)) { return -1; } @@ -92,17 +92,8 @@ int SubtitleStream::copyFrameBytes(ByteStorage* out, bool flush) { return sampler_.sample(flush ? 
nullptr : &sub_, out); } -void SubtitleStream::setHeader(DecoderHeader* header) { - header->seqno = numGenerator_++; - +void SubtitleStream::setFramePts(DecoderHeader* header, bool) { header->pts = sub_.pts; // already in us - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->keyFrame = 0; - header->fps = std::numeric_limits::quiet_NaN(); - header->format = format_; } + } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/subtitle_stream.h b/torchvision/csrc/cpu/decoder/subtitle_stream.h index 8669f15e0ce..4297cfa83f7 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_stream.h +++ b/torchvision/csrc/cpu/decoder/subtitle_stream.h @@ -1,10 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "stream.h" #include "subtitle_sampler.h" -#include "time_keeper.h" namespace ffmpeg { @@ -25,18 +22,17 @@ class SubtitleStream : public Stream { ~SubtitleStream() override; protected: - void setHeader(DecoderHeader* header) override; + void setFramePts(DecoderHeader* header, bool flush) override; private: int initFormat() override; - int analyzePacket(const AVPacket* packet, int* gotFramePtr) override; + int analyzePacket(const AVPacket* packet, bool* gotFrame) override; int estimateBytes(bool flush) override; int copyFrameBytes(ByteStorage* out, bool flush) override; void releaseSubtitle(); private: SubtitleSampler sampler_; - TimeKeeper keeper_; AVSubtitleKeeper sub_; }; diff --git a/torchvision/csrc/cpu/decoder/sync_decoder.cpp b/torchvision/csrc/cpu/decoder/sync_decoder.cpp index 6387837218e..5f3c38e08f8 100644 --- a/torchvision/csrc/cpu/decoder/sync_decoder.cpp +++ b/torchvision/csrc/cpu/decoder/sync_decoder.cpp @@ -1,23 +1,26 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
-
 #include "sync_decoder.h"
 #include

 namespace ffmpeg {

 SyncDecoder::VectorByteStorage::VectorByteStorage(size_t n) {
-  buffer_.resize(n);
+  ensure(n);
+}
+
+SyncDecoder::VectorByteStorage::~VectorByteStorage() {
+  av_free(buffer_);
 }

 void SyncDecoder::VectorByteStorage::ensure(size_t n) {
   if (tail() < n) {
-    buffer_.resize(offset_ + length_ + n);
+    capacity_ = offset_ + length_ + n;
+    buffer_ = static_cast<uint8_t*>(av_realloc(buffer_, capacity_));
   }
 }

 uint8_t* SyncDecoder::VectorByteStorage::writableTail() {
-  CHECK_LE(offset_ + length_, buffer_.size());
-  return buffer_.data() + offset_ + length_;
+  CHECK_LE(offset_ + length_, capacity_);
+  return buffer_ + offset_ + length_;
 }

 void SyncDecoder::VectorByteStorage::append(size_t n) {
@@ -32,7 +35,7 @@ void SyncDecoder::VectorByteStorage::trim(size_t n) {
 }

 const uint8_t* SyncDecoder::VectorByteStorage::data() const {
-  return buffer_.data() + offset_;
+  return buffer_ + offset_;
 }

 size_t SyncDecoder::VectorByteStorage::length() const {
@@ -40,13 +43,11 @@ size_t SyncDecoder::VectorByteStorage::length() const {
 }

 size_t SyncDecoder::VectorByteStorage::tail() const {
-  auto size = buffer_.size();
-  CHECK_LE(offset_ + length_, buffer_.size());
-  return size - offset_ - length_;
+  CHECK_LE(offset_ + length_, capacity_);
+  return capacity_ - offset_ - length_;
 }

 void SyncDecoder::VectorByteStorage::clear() {
-  buffer_.clear();
   offset_ = 0;
   length_ = 0;
 }
@@ -66,16 +67,22 @@ int SyncDecoder::decode(DecoderOutputMessage* out, uint64_t timeoutMs) {
   }

   if (queue_.empty()) {
-    int result = getBytes(timeoutMs);
+    int result = getFrame(timeoutMs);
+    // assign EOF
     eof_ = result == ENODATA;
-
+    // check unrecoverable error, any error but ENODATA
     if (result && result != ENODATA) {
       return result;
     }

     // still empty
     if (queue_.empty()) {
-      return ETIMEDOUT;
+      if (eof_) {
+        return ENODATA;
+      } else {
+        LOG(INFO) << "Queue is empty";
+        return ETIMEDOUT;
+      }
     }
   }
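Not part of the patch: a caller-side sketch of the decode() return convention established above, where 0 delivers a message, ETIMEDOUT means the queue was momentarily empty, ENODATA signals end of stream, and any other code is unrecoverable. The drainDecoder helper and its sink callback are hypothetical; only the return-code handling is taken from the hunk above.

#include <cerrno>
#include <functional>
#include "sync_decoder.h"

using namespace ffmpeg;

// Hypothetical helper: drives SyncDecoder::decode() until end of stream.
void drainDecoder(
    SyncDecoder& decoder,
    const std::function<void(const DecoderOutputMessage&)>& sink) {
  DecoderOutputMessage out;
  int ret;
  while ((ret = decoder.decode(&out, /*timeoutMs=*/100)) != ENODATA) {
    if (ret == 0) {
      sink(out); // a decoded message is ready
    } else if (ret != ETIMEDOUT) {
      break; // unrecoverable error, mirroring the early return above
    }
    // on ETIMEDOUT the queue was merely empty; poll again
  }
  decoder.shutdown();
}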
diff --git a/torchvision/csrc/cpu/decoder/sync_decoder.h b/torchvision/csrc/cpu/decoder/sync_decoder.h
index 76c347fe707..192962acc0c 100644
--- a/torchvision/csrc/cpu/decoder/sync_decoder.h
+++ b/torchvision/csrc/cpu/decoder/sync_decoder.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once

 #include
@@ -13,9 +11,11 @@ namespace ffmpeg {
 * or fetched internally by FFMPEG library
 */
 class SyncDecoder : public Decoder {
+  // Allocation of memory must be done with a proper alignment.
  class VectorByteStorage : public ByteStorage {
   public:
    VectorByteStorage(size_t n);
+    ~VectorByteStorage() override;
    void ensure(size_t n) override;
    uint8_t* writableTail() override;
    void append(size_t n) override;
@@ -28,7 +28,8 @@ class SyncDecoder : public Decoder {
   private:
    size_t offset_{0};
    size_t length_{0};
-    std::vector<uint8_t> buffer_;
+    size_t capacity_{0};
+    uint8_t* buffer_{nullptr};
  };

 public:
diff --git a/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp b/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp
index ee0fe3fcf3c..379c24a0aa0 100644
--- a/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp
+++ b/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp
@@ -1,7 +1,6 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include
 #include
+#include "memory_buffer.h"
 #include "sync_decoder.h"

 using namespace ffmpeg;

@@ -10,7 +9,8 @@ TEST(SyncDecoder, Test) {
   SyncDecoder decoder;
   DecoderParameters params;
   params.timeoutMs = 10000;
-  params.startOffsetMs = 1000;
+  params.startOffset = 1000000;
+  params.seekAccuracy = 100000;
   params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
   params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
   CHECK(decoder.init(params, nullptr));
@@ -20,3 +20,136 @@
   }
   decoder.shutdown();
 }
+
+TEST(SyncDecoder, TestHeadersOnly) {
+  SyncDecoder decoder;
+  DecoderParameters params;
+  params.timeoutMs = 10000;
+  params.startOffset = 1000000;
+  params.seekAccuracy = 100000;
+  params.headerOnly = true;
+  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
+  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
+  CHECK(decoder.init(params, nullptr));
+  DecoderOutputMessage out;
+  while (0 == decoder.decode(&out, 100)) {
+    LOG(INFO) << "Decoded frame, type: " << out.header.format.type
+              << ", timestamp(us): " << out.header.pts;
+  }
+  decoder.shutdown();
+}
+
+TEST(SyncDecoder, TestMemoryBuffer) {
+  SyncDecoder decoder;
+  DecoderParameters params;
+  params.timeoutMs = 10000;
+  params.startOffset = 1000000;
+  params.endOffset = 9000000;
+  params.seekAccuracy = 10000;
+  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
+
+  FILE* f = fopen(
+      "pytorch/vision/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi",
+      "rb");
+  CHECK(f != nullptr);
+  fseek(f, 0, SEEK_END);
+  std::vector<uint8_t> buffer(ftell(f));
+  rewind(f);
+  CHECK_EQ(buffer.size(), fread(buffer.data(), 1, buffer.size(), f));
+  fclose(f);
+  CHECK(decoder.init(
+      params, MemoryBuffer::getCallback(buffer.data(), buffer.size())));
+  LOG(INFO) << "Decoding from memory bytes: " << buffer.size();
+  DecoderOutputMessage out;
+  size_t audioFrames = 0, videoFrames = 0;
+  while (0 == decoder.decode(&out, 100)) {
+    if (out.header.format.type == TYPE_AUDIO) {
+      ++audioFrames;
+    } else if (out.header.format.type == TYPE_VIDEO) {
+      ++videoFrames;
+    }
+  }
+  LOG(INFO) << "Decoded audio frames: " << audioFrames
+            << ", video frames: " << videoFrames;
+  decoder.shutdown();
+}
+
+TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) {
+  SyncDecoder decoder;
+  DecoderParameters params;
+  params.timeoutMs = 10000;
+  params.startOffset = 1000000;
+  params.endOffset = 9000000;
+  params.seekAccuracy = 10000;
+  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
+
+  FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb");
+  CHECK(f != nullptr);
+  fseek(f, 0, SEEK_END);
+  std::vector<uint8_t> buffer(ftell(f));
+  rewind(f);
+  CHECK_EQ(buffer.size(), fread(buffer.data(), 1, buffer.size(), f));
+  fclose(f);
+
+  params.maxSeekableBytes = buffer.size() + 1;
+  MemoryBuffer object(buffer.data(), buffer.size());
+  CHECK(decoder.init(
+      params,
+      [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
+      -> int {
+        if (out) { // see defs.h file
+          // read mode
+          return object.read(out, size);
+        }
+        // seek mode
+        if (!timeoutMs) {
+          // seek capability, yes - no
+          return -1;
+        }
+        return object.seek(size, whence);
+      }));
+  DecoderOutputMessage out;
+  while (0 == decoder.decode(&out, 100)) {
+    LOG(INFO) << "Decoded frame, timestamp(us): " << out.header.pts
+              << ", num: " << out.header.format.num
+              << ", den: " << out.header.format.den
+              << ", duration(us): " << out.header.format.duration;
+  }
+  decoder.shutdown();
+}
+
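The memory-buffer tests above and below drive the decoder through a raw read/seek callback; the sketch here factors out the convention their lambdas follow. The callback signature and the read/seek dispatch are taken from the tests; treating a non-negative probe answer as "seek supported" is an assumption inferred from the -1 that the no-seek tests return.

#include <cstdint>
#include "memory_buffer.h"

using namespace ffmpeg;

// Hypothetical factory for the callback the tests write inline. A non-null
// out requests a read; a null out requests a seek; a null out with
// timeoutMs == 0 only probes seek support (see defs.h). With
// seekable == false the probe answers -1, as in the NoSeekable tests, so
// the decoder buffers the input internally, bounded by
// params.maxSeekableBytes.
auto makeMemoryCallback(MemoryBuffer object, bool seekable) {
  return [object, seekable](
             uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
         -> int {
    if (out) {
      return object.read(out, size); // read mode
    }
    if (!timeoutMs) {
      return seekable ? 0 : -1; // probe: seek capability, yes or no (assumed)
    }
    return object.seek(size, whence); // seek mode
  };
}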
+TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) {
+  SyncDecoder decoder;
+  DecoderParameters params;
+  params.timeoutMs = 10000;
+  params.startOffset = 1000000;
+  params.endOffset = 9000000;
+  params.seekAccuracy = 10000;
+  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
+
+  FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb");
+  CHECK(f != nullptr);
+  fseek(f, 0, SEEK_END);
+  std::vector<uint8_t> buffer(ftell(f));
+  rewind(f);
+  CHECK_EQ(buffer.size(), fread(buffer.data(), 1, buffer.size(), f));
+  fclose(f);
+
+  params.maxSeekableBytes = buffer.size() / 2;
+  MemoryBuffer object(buffer.data(), buffer.size());
+  CHECK(!decoder.init(
+      params,
+      [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
+      -> int {
+        if (out) { // see defs.h file
+          // read mode
+          return object.read(out, size);
+        }
+        // seek mode
+        if (!timeoutMs) {
+          // seek capability, yes - no
+          return -1;
+        }
+        return object.seek(size, whence);
+      }));
+}
diff --git a/torchvision/csrc/cpu/decoder/time_keeper.cpp b/torchvision/csrc/cpu/decoder/time_keeper.cpp
index a0da56a1f64..9cfc9457963 100644
--- a/torchvision/csrc/cpu/decoder/time_keeper.cpp
+++ b/torchvision/csrc/cpu/decoder/time_keeper.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "time_keeper.h"

 extern "C" {
@@ -9,13 +7,13 @@
 namespace ffmpeg {

 namespace {
-const ssize_t kMaxTimeBaseDiference = 10;
+const long kMaxTimeBaseDiference = 10;
 }

-ssize_t TimeKeeper::adjust(ssize_t& decoderTimestamp) {
-  const ssize_t now = std::chrono::duration_cast<std::chrono::microseconds>(
-                          std::chrono::system_clock::now().time_since_epoch())
-                          .count();
+long TimeKeeper::adjust(long& decoderTimestamp) {
+  const long now = std::chrono::duration_cast<std::chrono::microseconds>(
+                       std::chrono::system_clock::now().time_since_epoch())
+                       .count();

   if (startTime_ == 0) {
     startTime_ = now;
diff --git a/torchvision/csrc/cpu/decoder/time_keeper.h b/torchvision/csrc/cpu/decoder/time_keeper.h
index c9d06025b2c..e4d4718c705 100644
--- a/torchvision/csrc/cpu/decoder/time_keeper.h
+++ b/torchvision/csrc/cpu/decoder/time_keeper.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once

 #include
@@ -17,11 +15,11 @@ class TimeKeeper {

   // adjust provided @timestamp to the corrected value
   // return advised sleep time before next frame processing in (us)
-  ssize_t adjust(ssize_t& decoderTimestamp);
+  long adjust(long& decoderTimestamp);

  private:
-  ssize_t startTime_{0};
-  ssize_t streamTimestamp_{0};
+  long startTime_{0};
+  long streamTimestamp_{0};
 };

 } // namespace ffmpeg
diff --git a/torchvision/csrc/cpu/decoder/util.cpp b/torchvision/csrc/cpu/decoder/util.cpp
index 6ae888838ea..ba19cf582b0 100644
--- a/torchvision/csrc/cpu/decoder/util.cpp
+++ b/torchvision/csrc/cpu/decoder/util.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "util.h"

 #include
diff --git a/torchvision/csrc/cpu/decoder/util.h b/torchvision/csrc/cpu/decoder/util.h
index 6a985d78559..cc64d8944e4 100644
--- a/torchvision/csrc/cpu/decoder/util.h
+++ b/torchvision/csrc/cpu/decoder/util.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once

 #include "defs.h"
diff --git a/torchvision/csrc/cpu/decoder/video_sampler.cpp b/torchvision/csrc/cpu/decoder/video_sampler.cpp
index 1a91c82a371..4b7d078ebd7 100644
--- a/torchvision/csrc/cpu/decoder/video_sampler.cpp
+++ b/torchvision/csrc/cpu/decoder/video_sampler.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
- #include "video_sampler.h" #include #include "util.h" diff --git a/torchvision/csrc/cpu/decoder/video_sampler.h b/torchvision/csrc/cpu/decoder/video_sampler.h index 73997c213e1..85161307257 100644 --- a/torchvision/csrc/cpu/decoder/video_sampler.h +++ b/torchvision/csrc/cpu/decoder/video_sampler.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/video_stream.cpp b/torchvision/csrc/cpu/decoder/video_stream.cpp index 9c6b77d0bfc..e464ed30cc9 100644 --- a/torchvision/csrc/cpu/decoder/video_stream.cpp +++ b/torchvision/csrc/cpu/decoder/video_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "video_stream.h" #include #include "util.h" @@ -11,12 +9,23 @@ bool operator==(const VideoFormat& x, const AVFrame& y) { return x.width == y.width && x.height == y.height && x.format == y.format; } +bool operator==(const VideoFormat& x, const AVCodecContext& y) { + return x.width == y.width && x.height == y.height && x.format == y.pix_fmt; +} + VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) { x.width = y.width; x.height = y.height; x.format = y.format; return x; } + +VideoFormat& toVideoFormat(VideoFormat& x, const AVCodecContext& y) { + x.width = y.width; + x.height = y.height; + x.format = y.pix_fmt; + return x; +} } // namespace VideoStream::VideoStream( @@ -28,8 +37,8 @@ VideoStream::VideoStream( : Stream( inputCtx, MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime), - loggingUuid_(loggingUuid) {} + convertPtsToWallTime, + loggingUuid) {} VideoStream::~VideoStream() { if (sampler_) { @@ -79,12 +88,14 @@ int VideoStream::initFormat() { int VideoStream::estimateBytes(bool flush) { ensureSampler(); // check if input format gets changed - if (!flush && !(sampler_->getInputFormat().video == *frame_)) { + if (flush ? !(sampler_->getInputFormat().video == *codecCtx_) + : !(sampler_->getInputFormat().video == *frame_)) { // - reinit sampler SamplerParameters params; params.type = format_.type; params.out = format_.format; - toVideoFormat(params.in.video, *frame_); + flush ? toVideoFormat(params.in.video, *codecCtx_) + : toVideoFormat(params.in.video, *frame_); if (!sampler_->init(params)) { return -1; } @@ -108,36 +119,13 @@ int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) { return sampler_->sample(flush ? nullptr : frame_, out); } -void VideoStream::setHeader(DecoderHeader* header) { - header->seqno = numGenerator_++; - - if (codecCtx_->time_base.num != 0) { - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - codecCtx_->time_base, - AV_TIME_BASE_Q); - } else { - // If the codec time_base is missing then we would've skipped the - // rescalePackage step to rescale to codec time_base, so here we can - // rescale straight from the stream time_base into AV_TIME_BASE_Q. 
- header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - inputCtx_->streams[format_.stream]->time_base, - AV_TIME_BASE_Q); - } - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->keyFrame = frame_->key_frame; - auto fpsRational = inputCtx_->streams[format_.stream]->avg_frame_rate; - if (fpsRational.den) { - header->fps = av_q2d(fpsRational); - } else { - header->fps = std::numeric_limits::quiet_NaN(); +void VideoStream::setHeader(DecoderHeader* header, bool flush) { + Stream::setHeader(header, flush); + if (!flush) { // no frames for video flush + header->keyFrame = frame_->key_frame; + header->fps = av_q2d(av_guess_frame_rate( + inputCtx_, inputCtx_->streams[format_.stream], nullptr)); } - header->format = format_; } } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/video_stream.h b/torchvision/csrc/cpu/decoder/video_stream.h index af1e3fb960f..8e73d099613 100644 --- a/torchvision/csrc/cpu/decoder/video_stream.h +++ b/torchvision/csrc/cpu/decoder/video_stream.h @@ -1,9 +1,6 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "stream.h" -#include "time_keeper.h" #include "video_sampler.h" namespace ffmpeg { @@ -19,21 +16,19 @@ class VideoStream : public Stream { int index, bool convertPtsToWallTime, const VideoFormat& format, - int64_t loggingUuid = 0); + int64_t loggingUuid); ~VideoStream() override; private: int initFormat() override; int estimateBytes(bool flush) override; int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header) override; + void setHeader(DecoderHeader* header, bool flush) override; void ensureSampler(); private: std::unique_ptr sampler_; - TimeKeeper keeper_; - int64_t loggingUuid_{0}; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp b/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp deleted file mode 100644 index 24aecacf946..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include "FfmpegAudioSampler.h" -#include -#include "FfmpegUtil.h" - -using namespace std; - -FfmpegAudioSampler::FfmpegAudioSampler( - const AudioFormat& in, - const AudioFormat& out) - : inFormat_(in), outFormat_(out) {} - -FfmpegAudioSampler::~FfmpegAudioSampler() { - if (swrContext_) { - swr_free(&swrContext_); - } -} - -int FfmpegAudioSampler::init() { - swrContext_ = swr_alloc_set_opts( - nullptr, // we're allocating a new context - av_get_default_channel_layout(outFormat_.channels), // out_ch_layout - static_cast(outFormat_.format), // out_sample_fmt - outFormat_.samples, // out_sample_rate - av_get_default_channel_layout(inFormat_.channels), // in_ch_layout - static_cast(inFormat_.format), // in_sample_fmt - inFormat_.samples, // in_sample_rate - 0, // log_offset - nullptr); // log_ctx - if (swrContext_ == nullptr) { - LOG(ERROR) << "swr_alloc_set_opts fails"; - return -1; - } - int result = 0; - if ((result = swr_init(swrContext_)) < 0) { - LOG(ERROR) << "swr_init failed, err: " << ffmpeg_util::getErrorDesc(result) - << ", in -> format: " << inFormat_.format - << ", channels: " << inFormat_.channels - << ", samples: " << inFormat_.samples - << ", out -> format: " << outFormat_.format - << ", channels: " << outFormat_.channels - << ", samples: " << outFormat_.samples; - return -1; - } - return 0; -} - -int64_t FfmpegAudioSampler::getSampleBytes(const AVFrame* frame) const { - auto outSamples = getOutNumSamples(frame->nb_samples); - - return 
av_samples_get_buffer_size( - nullptr, - outFormat_.channels, - outSamples, - static_cast(outFormat_.format), - 1); -} - -// https://www.ffmpeg.org/doxygen/3.2/group__lswr.html -unique_ptr FfmpegAudioSampler::sample(const AVFrame* frame) { - if (!frame) { - return nullptr; // no flush for videos - } - - auto inNumSamples = frame->nb_samples; - auto outNumSamples = getOutNumSamples(frame->nb_samples); - - auto outSampleSize = getSampleBytes(frame); - AvDataPtr frameData(static_cast(av_malloc(outSampleSize))); - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS]; - int result = 0; - if ((result = av_samples_fill_arrays( - outPlanes, - nullptr, // linesize is not needed - frameData.get(), - outFormat_.channels, - outNumSamples, - static_cast(outFormat_.format), - 1)) < 0) { - LOG(ERROR) << "av_samples_fill_arrays failed, err: " - << ffmpeg_util::getErrorDesc(result) - << ", outNumSamples: " << outNumSamples - << ", format: " << outFormat_.format; - return nullptr; - } - - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - (const uint8_t**)&frame->data[0], - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert faield, err: " - << ffmpeg_util::getErrorDesc(result); - return nullptr; - } - // result returned by swr_convert is the No. of actual output samples. - // So update the buffer size using av_samples_get_buffer_size - result = av_samples_get_buffer_size( - nullptr, - outFormat_.channels, - result, - static_cast(outFormat_.format), - 1); - - return make_unique(std::move(frameData), result, 0); -} -/* -Because of decoding delay, the returned value is an upper bound of No. of -output samples -*/ -int64_t FfmpegAudioSampler::getOutNumSamples(int inNumSamples) const { - return av_rescale_rnd( - swr_get_delay(swrContext_, inFormat_.samples) + inNumSamples, - outFormat_.samples, - inFormat_.samples, - AV_ROUND_UP); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h b/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h deleted file mode 100644 index 767a5ca6e4f..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include "FfmpegSampler.h" - -#define AVRESAMPLE_MAX_CHANNELS 32 - -/** - * Class transcode audio frames from one format into another - */ -class FfmpegAudioSampler : public FfmpegSampler { - public: - explicit FfmpegAudioSampler(const AudioFormat& in, const AudioFormat& out); - ~FfmpegAudioSampler() override; - - int init() override; - - int64_t getSampleBytes(const AVFrame* frame) const; - // FfmpegSampler overrides - // returns number of bytes of the sampled data - std::unique_ptr sample(const AVFrame* frame) override; - - const AudioFormat& getInFormat() const { - return inFormat_; - } - - private: - int64_t getOutNumSamples(int inNumSamples) const; - - AudioFormat inFormat_; - AudioFormat outFormat_; - SwrContext* swrContext_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp b/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp deleted file mode 100644 index b5b1e2fbda5..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp +++ /dev/null @@ -1,103 +0,0 @@ -#include "FfmpegAudioStream.h" -#include "FfmpegUtil.h" - -using namespace std; - -namespace { - -bool operator==(const AudioFormat& x, const AVCodecContext& y) { - return x.samples == y.sample_rate && x.channels == y.channels && - x.format == y.sample_fmt; -} - -AudioFormat& toAudioFormat( - AudioFormat& audioFormat, - const AVCodecContext& codecCtx) { - 
audioFormat.samples = codecCtx.sample_rate; - audioFormat.channels = codecCtx.channels; - audioFormat.format = codecCtx.sample_fmt; - - return audioFormat; -} - -} // namespace - -FfmpegAudioStream::FfmpegAudioStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin) - : FfmpegStream(inputCtx, index, avMediaType, seekFrameMargin), - mediaFormat_(mediaFormat) {} - -FfmpegAudioStream::~FfmpegAudioStream() {} - -void FfmpegAudioStream::checkStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first > 0) { - CHECK_EQ(timeBase.first, inputCtx_->streams[index_]->time_base.num); - CHECK_EQ(timeBase.second, inputCtx_->streams[index_]->time_base.den); - } -} - -void FfmpegAudioStream::updateStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first == 0) { - mediaFormat_.format.audio.timeBaseNum = - inputCtx_->streams[index_]->time_base.num; - mediaFormat_.format.audio.timeBaseDen = - inputCtx_->streams[index_]->time_base.den; - } - mediaFormat_.format.audio.duration = inputCtx_->streams[index_]->duration; -} - -int FfmpegAudioStream::initFormat() { - AudioFormat& format = mediaFormat_.format.audio; - - if (format.samples == 0) { - format.samples = codecCtx_->sample_rate; - } - if (format.channels == 0) { - format.channels = codecCtx_->channels; - } - if (format.format == AV_SAMPLE_FMT_NONE) { - format.format = codecCtx_->sample_fmt; - VLOG(2) << "set stream format sample_fmt: " << format.format; - } - - checkStreamDecodeParams(); - - updateStreamDecodeParams(); - - if (format.samples > 0 && format.channels > 0 && - format.format != AV_SAMPLE_FMT_NONE) { - return 0; - } else { - return -1; - } -} - -unique_ptr FfmpegAudioStream::sampleFrameData() { - AudioFormat& audioFormat = mediaFormat_.format.audio; - - if (!sampler_ || !(sampler_->getInFormat() == *codecCtx_)) { - AudioFormat newInFormat; - newInFormat = toAudioFormat(newInFormat, *codecCtx_); - sampler_ = make_unique(newInFormat, audioFormat); - VLOG(1) << "Set sampler input audio format" - << ", samples: " << newInFormat.samples - << ", channels: " << newInFormat.channels - << ", format: " << newInFormat.format - << " : output audio sampler format" - << ", samples: " << audioFormat.samples - << ", channels: " << audioFormat.channels - << ", format: " << audioFormat.format; - int ret = sampler_->init(); - if (ret < 0) { - VLOG(1) << "Fail to initialize audio sampler"; - return nullptr; - } - } - return sampler_->sample(frame_); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h b/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h deleted file mode 100644 index 1d4f7a2f2ee..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include -#include "FfmpegAudioSampler.h" -#include "FfmpegStream.h" - -/** - * Class uses FFMPEG library to decode one video stream. 
- */ -class FfmpegAudioStream : public FfmpegStream { - public: - explicit FfmpegAudioStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin); - - ~FfmpegAudioStream() override; - - // FfmpegStream overrides - MediaType getMediaType() const override { - return MediaType::TYPE_AUDIO; - } - - FormatUnion getMediaFormat() const override { - return mediaFormat_.format; - } - - int64_t getStartPts() const override { - return mediaFormat_.format.audio.startPts; - } - int64_t getEndPts() const override { - return mediaFormat_.format.audio.endPts; - } - // return numerator and denominator of time base - std::pair getTimeBase() const { - return std::make_pair( - mediaFormat_.format.audio.timeBaseNum, - mediaFormat_.format.audio.timeBaseDen); - } - - void checkStreamDecodeParams(); - - void updateStreamDecodeParams(); - - protected: - int initFormat() override; - std::unique_ptr sampleFrameData() override; - - private: - MediaFormat mediaFormat_; - std::unique_ptr sampler_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp b/torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp deleted file mode 100644 index fb4d302cc03..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp +++ /dev/null @@ -1,412 +0,0 @@ -#include "FfmpegDecoder.h" -#include "FfmpegAudioStream.h" -#include "FfmpegUtil.h" -#include "FfmpegVideoStream.h" - -using namespace std; - -static AVPacket avPkt; - -namespace { - -unique_ptr createFfmpegStream( - MediaType type, - AVFormatContext* ctx, - int idx, - MediaFormat& mediaFormat, - double seekFrameMargin) { - enum AVMediaType avType; - CHECK(ffmpeg_util::mapMediaType(type, &avType)); - switch (type) { - case MediaType::TYPE_VIDEO: - return make_unique( - ctx, idx, avType, mediaFormat, seekFrameMargin); - case MediaType::TYPE_AUDIO: - return make_unique( - ctx, idx, avType, mediaFormat, seekFrameMargin); - default: - return nullptr; - } -} - -} // namespace - -FfmpegAvioContext::FfmpegAvioContext() - : workBuffersize_(VIO_BUFFER_SZ), - workBuffer_((uint8_t*)av_malloc(workBuffersize_)), - inputFile_(nullptr), - inputBuffer_(nullptr), - inputBufferSize_(0) {} - -int FfmpegAvioContext::initAVIOContext(const uint8_t* buffer, int64_t size) { - inputBuffer_ = buffer; - inputBufferSize_ = size; - avioCtx_ = avio_alloc_context( - workBuffer_, - workBuffersize_, - 0, - reinterpret_cast(this), - &FfmpegAvioContext::readMemory, - nullptr, // no write function - &FfmpegAvioContext::seekMemory); - return 0; -} - -FfmpegAvioContext::~FfmpegAvioContext() { - /* note: the internal buffer could have changed, and be != workBuffer_ */ - if (avioCtx_) { - av_freep(&avioCtx_->buffer); - av_freep(&avioCtx_); - } else { - av_freep(&workBuffer_); - } - if (inputFile_) { - fclose(inputFile_); - } -} - -int FfmpegAvioContext::read(uint8_t* buf, int buf_size) { - if (inputBuffer_) { - return readMemory(this, buf, buf_size); - } else { - return -1; - } -} - -int FfmpegAvioContext::readMemory(void* opaque, uint8_t* buf, int buf_size) { - FfmpegAvioContext* h = static_cast(opaque); - if (buf_size < 0) { - return -1; - } - - int reminder = h->inputBufferSize_ - h->offset_; - int r = buf_size < reminder ? 
buf_size : reminder; - if (r < 0) { - return AVERROR_EOF; - } - - memcpy(buf, h->inputBuffer_ + h->offset_, r); - h->offset_ += r; - return r; -} - -int64_t FfmpegAvioContext::seek(int64_t offset, int whence) { - if (inputBuffer_) { - return seekMemory(this, offset, whence); - } else { - return -1; - } -} - -int64_t FfmpegAvioContext::seekMemory( - void* opaque, - int64_t offset, - int whence) { - FfmpegAvioContext* h = static_cast(opaque); - switch (whence) { - case SEEK_CUR: // from current position - h->offset_ += offset; - break; - case SEEK_END: // from eof - h->offset_ = h->inputBufferSize_ + offset; - break; - case SEEK_SET: // from beginning of file - h->offset_ = offset; - break; - case AVSEEK_SIZE: - return h->inputBufferSize_; - } - return h->offset_; -} - -int FfmpegDecoder::init( - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput) { - cleanUp(); - - int ret = 0; - if (!isDecodeFile) { - formatCtx_ = avformat_alloc_context(); - if (!formatCtx_) { - LOG(ERROR) << "avformat_alloc_context failed"; - return -1; - } - formatCtx_->pb = ioctx.get_avio(); - formatCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; - - // Determining the input format: - int probeSz = AVPROBE_SIZE + AVPROBE_PADDING_SIZE; - uint8_t* probe((uint8_t*)av_malloc(probeSz)); - memset(probe, 0, probeSz); - int len = ioctx.read(probe, probeSz - AVPROBE_PADDING_SIZE); - if (len < probeSz - AVPROBE_PADDING_SIZE) { - LOG(ERROR) << "Insufficient data to determine video format"; - av_freep(&probe); - return -1; - } - // seek back to start of stream - ioctx.seek(0, SEEK_SET); - - unique_ptr probeData(new AVProbeData()); - probeData->buf = probe; - probeData->buf_size = len; - probeData->filename = ""; - // Determine the input-format: - formatCtx_->iformat = av_probe_input_format(probeData.get(), 1); - // this is to avoid the double-free error - if (formatCtx_->iformat == nullptr) { - LOG(ERROR) << "av_probe_input_format fails"; - return -1; - } - VLOG(1) << "av_probe_input_format succeeds"; - av_freep(&probe); - - ret = avformat_open_input(&formatCtx_, "", nullptr, nullptr); - } else { - ret = avformat_open_input(&formatCtx_, filename.c_str(), nullptr, nullptr); - } - - if (ret < 0) { - LOG(ERROR) << "avformat_open_input failed, error: " - << ffmpeg_util::getErrorDesc(ret); - cleanUp(); - return ret; - } - ret = avformat_find_stream_info(formatCtx_, nullptr); - if (ret < 0) { - LOG(ERROR) << "avformat_find_stream_info failed, error: " - << ffmpeg_util::getErrorDesc(ret); - cleanUp(); - return ret; - } - if (!initStreams()) { - LOG(ERROR) << "Cannot activate streams"; - cleanUp(); - return -1; - } - - for (auto& stream : streams_) { - MediaType mediaType = stream.second->getMediaType(); - decoderOutput.initMediaType(mediaType, stream.second->getMediaFormat()); - } - VLOG(1) << "FfmpegDecoder initialized"; - return 0; -} - -int FfmpegDecoder::decodeFile( - unique_ptr params, - const string& fileName, - DecoderOutput& decoderOutput) { - VLOG(1) << "decode file: " << fileName; - FfmpegAvioContext ioctx; - int ret = decodeLoop(std::move(params), fileName, true, ioctx, decoderOutput); - return ret; -} - -int FfmpegDecoder::decodeMemory( - unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput) { - VLOG(1) << "decode video data in memory"; - FfmpegAvioContext ioctx; - int ret = ioctx.initAVIOContext(buffer, size); - if (ret == 0) { - ret = - decodeLoop(std::move(params), string(""), false, ioctx, decoderOutput); - } - return ret; -} - -int 
FfmpegDecoder::probeFile( - unique_ptr params, - const string& fileName, - DecoderOutput& decoderOutput) { - VLOG(1) << "probe file: " << fileName; - FfmpegAvioContext ioctx; - return probeVideo(std::move(params), fileName, true, ioctx, decoderOutput); -} - -int FfmpegDecoder::probeMemory( - unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput) { - VLOG(1) << "probe video data in memory"; - FfmpegAvioContext ioctx; - int ret = ioctx.initAVIOContext(buffer, size); - if (ret == 0) { - ret = - probeVideo(std::move(params), string(""), false, ioctx, decoderOutput); - } - return ret; -} - -void FfmpegDecoder::cleanUp() { - if (formatCtx_) { - for (auto& stream : streams_) { - // Drain stream buffers. - DecoderOutput decoderOutput; - stream.second->flush(1, decoderOutput); - stream.second.reset(); - } - streams_.clear(); - avformat_close_input(&formatCtx_); - } -} - -FfmpegStream* FfmpegDecoder::findStreamByIndex(int streamIndex) const { - auto it = streams_.find(streamIndex); - return it != streams_.end() ? it->second.get() : nullptr; -} - -/* -Reference implementation: -https://ffmpeg.org/doxygen/3.4/demuxing_decoding_8c-example.html -*/ -int FfmpegDecoder::decodeLoop( - unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput) { - params_ = std::move(params); - - int ret = init(filename, isDecodeFile, ioctx, decoderOutput); - if (ret < 0) { - return ret; - } - // init package - av_init_packet(&avPkt); - avPkt.data = nullptr; - avPkt.size = 0; - - int result = 0; - bool ptsInRange = true; - while (ptsInRange) { - result = av_read_frame(formatCtx_, &avPkt); - if (result == AVERROR(EAGAIN)) { - VLOG(1) << "Decoder is busy"; - ret = 0; - break; - } else if (result == AVERROR_EOF) { - VLOG(1) << "Stream decoding is completed"; - ret = 0; - break; - } else if (result < 0) { - VLOG(1) << "av_read_frame fails. Break decoder loop. Error: " - << ffmpeg_util::getErrorDesc(result); - ret = result; - break; - } - - ret = 0; - auto stream = findStreamByIndex(avPkt.stream_index); - if (stream == nullptr) { - // the packet is from a stream the caller is not interested. Ignore it - VLOG(2) << "avPkt ignored. stream index: " << avPkt.stream_index; - // Need to free the memory of AVPacket. Otherwise, memory leak happens - av_packet_unref(&avPkt); - continue; - } - - do { - result = stream->sendPacket(&avPkt); - if (result == AVERROR(EAGAIN)) { - VLOG(2) << "avcodec_send_packet returns AVERROR(EAGAIN)"; - // start to recevie available frames from internal buffer - stream->receiveAvailFrames(params_->getPtsOnly, decoderOutput); - if (isPtsExceedRange()) { - // exit the most-outer while loop - VLOG(1) << "In all streams, exceed the end pts. Exit decoding loop"; - ret = 0; - ptsInRange = false; - break; - } - } else if (result < 0) { - LOG(WARNING) << "avcodec_send_packet failed. Error: " - << ffmpeg_util::getErrorDesc(result); - ret = result; - break; - } else { - VLOG(2) << "avcodec_send_packet succeeds"; - // succeed. Read the next AVPacket and send out it - break; - } - } while (ptsInRange); - // Need to free the memory of AVPacket. 
Otherwise, memory leak happens - av_packet_unref(&avPkt); - } - /* flush cached frames */ - flushStreams(decoderOutput); - return ret; -} - -int FfmpegDecoder::probeVideo( - unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput) { - params_ = std::move(params); - return init(filename, isDecodeFile, ioctx, decoderOutput); -} - -bool FfmpegDecoder::initStreams() { - for (auto it = params_->formats.begin(); it != params_->formats.end(); ++it) { - AVMediaType mediaType; - if (!ffmpeg_util::mapMediaType(it->first, &mediaType)) { - LOG(ERROR) << "Unknown media type: " << it->first; - return false; - } - int streamIdx = - av_find_best_stream(formatCtx_, mediaType, -1, -1, nullptr, 0); - - if (streamIdx >= 0) { - VLOG(2) << "find stream index: " << streamIdx; - auto stream = createFfmpegStream( - it->first, - formatCtx_, - streamIdx, - it->second, - params_->seekFrameMargin); - - CHECK(stream); - if (stream->openCodecContext() < 0) { - LOG(ERROR) << "Cannot open codec. Stream index: " << streamIdx; - return false; - } - streams_.emplace(streamIdx, move(stream)); - } else { - VLOG(1) << "Cannot open find stream of type " << it->first; - } - } - // Seek frames in each stream - int ret = 0; - for (auto& stream : streams_) { - auto startPts = stream.second->getStartPts(); - VLOG(1) << "stream: " << stream.first << " startPts: " << startPts; - if (startPts > 0 && (ret = stream.second->seekFrame(startPts)) < 0) { - LOG(WARNING) << "seekFrame in stream fails"; - return false; - } - } - VLOG(1) << "initStreams succeeds"; - return true; -} - -bool FfmpegDecoder::isPtsExceedRange() { - bool exceed = true; - for (auto& stream : streams_) { - exceed = exceed && stream.second->isFramePtsExceedRange(); - } - return exceed; -} - -void FfmpegDecoder::flushStreams(DecoderOutput& decoderOutput) { - for (auto& stream : streams_) { - stream.second->flush(params_->getPtsOnly, decoderOutput); - } -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.h b/torchvision/csrc/cpu/video_reader/FfmpegDecoder.h deleted file mode 100644 index a0a564a4214..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -#include -#include - -#include "FfmpegHeaders.h" -#include "FfmpegStream.h" -#include "Interface.h" - -#define VIO_BUFFER_SZ 81920 -#define AVPROBE_SIZE 8192 - -class DecoderParameters { - public: - std::unordered_map formats; - // av_seek_frame is imprecise so seek to a timestamp earlier by a margin - // The unit of margin is second - double seekFrameMargin{1.0}; - // When getPtsOnly is set to 1, we only get pts of each frame and don not - // output frame data. 
It will be much faster - int64_t getPtsOnly{0}; -}; - -class FfmpegAvioContext { - public: - FfmpegAvioContext(); - - int initAVIOContext(const uint8_t* buffer, int64_t size); - - ~FfmpegAvioContext(); - - int read(uint8_t* buf, int buf_size); - - static int readMemory(void* opaque, uint8_t* buf, int buf_size); - - int64_t seek(int64_t offset, int whence); - - static int64_t seekMemory(void* opaque, int64_t offset, int whence); - - AVIOContext* get_avio() { - return avioCtx_; - } - - private: - int workBuffersize_; - uint8_t* workBuffer_; - // for file mode - FILE* inputFile_; - // for memory mode - const uint8_t* inputBuffer_; - int inputBufferSize_; - int offset_ = 0; - - AVIOContext* avioCtx_{nullptr}; -}; - -class FfmpegDecoder { - public: - FfmpegDecoder() { - av_register_all(); - } - ~FfmpegDecoder() { - cleanUp(); - } - // return 0 on success - // return negative number on failure - int decodeFile( - std::unique_ptr params, - const std::string& filename, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int decodeMemory( - std::unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int probeFile( - std::unique_ptr params, - const std::string& filename, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int probeMemory( - std::unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput); - - void cleanUp(); - - private: - FfmpegStream* findStreamByIndex(int streamIndex) const; - - int init( - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int decodeLoop( - std::unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput); - - int probeVideo( - std::unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput); - - bool initStreams(); - - void flushStreams(DecoderOutput& decoderOutput); - // whether in all streams, the pts of most recent frame exceeds range - bool isPtsExceedRange(); - - std::unordered_map> streams_; - AVFormatContext* formatCtx_{nullptr}; - std::unique_ptr params_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegHeaders.h b/torchvision/csrc/cpu/video_reader/FfmpegHeaders.h deleted file mode 100644 index ff26aa30a8d..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegHeaders.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -extern "C" { -#include -#include -#include -#include -#include -#include -#include -#include -#include -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegSampler.h b/torchvision/csrc/cpu/video_reader/FfmpegSampler.h deleted file mode 100644 index 3d00be3486f..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegSampler.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "FfmpegHeaders.h" -#include "Interface.h" - -/** - * Class sample data from AVFrame - */ -class FfmpegSampler { - public: - virtual ~FfmpegSampler() = default; - // return 0 on success and negative number on failure - virtual int init() = 0; - // sample from the given frame - virtual std::unique_ptr sample(const AVFrame* frame) = 0; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegStream.cpp 
b/torchvision/csrc/cpu/video_reader/FfmpegStream.cpp deleted file mode 100644 index b745170baf4..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegStream.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "FfmpegStream.h" -#include "FfmpegUtil.h" - -using namespace std; - -// (TODO) Currently, disable the use of refCount -static int refCount = 0; - -FfmpegStream::FfmpegStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - double seekFrameMargin) - : inputCtx_(inputCtx), - index_(index), - avMediaType_(avMediaType), - seekFrameMargin_(seekFrameMargin) {} - -FfmpegStream::~FfmpegStream() { - if (frame_) { - av_frame_free(&frame_); - } - avcodec_free_context(&codecCtx_); -} - -int FfmpegStream::openCodecContext() { - VLOG(2) << "stream start_time: " << inputCtx_->streams[index_]->start_time; - - auto typeString = av_get_media_type_string(avMediaType_); - AVStream* st = inputCtx_->streams[index_]; - auto codec_id = st->codecpar->codec_id; - VLOG(1) << "codec_id: " << codec_id; - AVCodec* codec = avcodec_find_decoder(codec_id); - if (!codec) { - LOG(ERROR) << "avcodec_find_decoder failed for codec_id: " << int(codec_id); - return AVERROR(EINVAL); - } - VLOG(1) << "Succeed to find decoder"; - - codecCtx_ = avcodec_alloc_context3(codec); - if (!codecCtx_) { - LOG(ERROR) << "avcodec_alloc_context3 fails"; - return AVERROR(ENOMEM); - } - - int ret; - /* Copy codec parameters from input stream to output codec context */ - if ((ret = avcodec_parameters_to_context(codecCtx_, st->codecpar)) < 0) { - LOG(ERROR) << "Failed to copy " << typeString - << " codec parameters to decoder context"; - return ret; - } - - AVDictionary* opts = nullptr; - av_dict_set(&opts, "refcounted_frames", refCount ? "1" : "0", 0); - - // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful - // But inputCtx_->streams[index_]->time_base has meaningful values - if ((ret = avcodec_open2(codecCtx_, codec, &opts)) < 0) { - LOG(ERROR) << "avcodec_open2 failed. " << ffmpeg_util::getErrorDesc(ret); - return ret; - } - VLOG(1) << "Succeed to open codec"; - - frame_ = av_frame_alloc(); - return initFormat(); -} - -unique_ptr FfmpegStream::getFrameData(int getPtsOnly) { - if (!codecCtx_) { - LOG(ERROR) << "Codec is not initialized"; - return nullptr; - } - if (getPtsOnly) { - unique_ptr decodedFrame = make_unique(); - decodedFrame->pts_ = frame_->pts; - return decodedFrame; - } else { - unique_ptr decodedFrame = sampleFrameData(); - if (decodedFrame) { - decodedFrame->pts_ = frame_->pts; - } - return decodedFrame; - } -} - -void FfmpegStream::flush(int getPtsOnly, DecoderOutput& decoderOutput) { - VLOG(1) << "Media Type: " << getMediaType() << ", flush stream."; - // need to receive frames before entering draining mode - receiveAvailFrames(getPtsOnly, decoderOutput); - - VLOG(2) << "send nullptr packet"; - sendPacket(nullptr); - // receive remaining frames after entering draining mode - receiveAvailFrames(getPtsOnly, decoderOutput); - - avcodec_flush_buffers(codecCtx_); -} - -bool FfmpegStream::isFramePtsInRange() { - CHECK(frame_); - auto pts = frame_->pts; - auto startPts = this->getStartPts(); - auto endPts = this->getEndPts(); - VLOG(2) << "isPtsInRange. pts: " << pts << ", startPts: " << startPts - << ", endPts: " << endPts; - return (pts == AV_NOPTS_VALUE) || - (pts >= startPts && (endPts >= 0 ? pts <= endPts : true)); -} - -bool FfmpegStream::isFramePtsExceedRange() { - if (frame_) { - auto endPts = this->getEndPts(); - VLOG(2) << "isFramePtsExceedRange. 
last_pts_: " << last_pts_ - << ", endPts: " << endPts; - return endPts >= 0 ? last_pts_ >= endPts : false; - } else { - return true; - } -} - -// seek a frame -int FfmpegStream::seekFrame(int64_t seekPts) { - // translate margin from second to pts - int64_t margin = (int64_t)( - seekFrameMargin_ * (double)inputCtx_->streams[index_]->time_base.den / - (double)inputCtx_->streams[index_]->time_base.num); - int64_t real_seekPts = (seekPts - margin) > 0 ? (seekPts - margin) : 0; - VLOG(2) << "seek margin: " << margin; - VLOG(2) << "real seekPts: " << real_seekPts; - int ret = av_seek_frame( - inputCtx_, - index_, - (seekPts - margin) > 0 ? (seekPts - margin) : 0, - AVSEEK_FLAG_BACKWARD); - if (ret < 0) { - LOG(WARNING) << "av_seek_frame fails. Stream index: " << index_; - return ret; - } - return 0; -} - -// send/receive encoding and decoding API overview -// https://ffmpeg.org/doxygen/3.4/group__lavc__encdec.html -int FfmpegStream::sendPacket(const AVPacket* packet) { - return avcodec_send_packet(codecCtx_, packet); -} - -int FfmpegStream::receiveFrame() { - int ret = avcodec_receive_frame(codecCtx_, frame_); - if (ret >= 0) { - // succeed - frame_->pts = av_frame_get_best_effort_timestamp(frame_); - if (frame_->pts == AV_NOPTS_VALUE) { - // Trick: if we can not figure out pts, we just set it to be (last_pts + - // 1) - frame_->pts = last_pts_ + 1; - } - last_pts_ = frame_->pts; - - VLOG(2) << "avcodec_receive_frame succeed"; - } else if (ret == AVERROR(EAGAIN)) { - VLOG(2) << "avcodec_receive_frame fails and returns AVERROR(EAGAIN). "; - } else if (ret == AVERROR_EOF) { - // no more frame to read - VLOG(2) << "avcodec_receive_frame returns AVERROR_EOF"; - } else { - LOG(WARNING) << "avcodec_receive_frame failed. Error: " - << ffmpeg_util::getErrorDesc(ret); - } - return ret; -} - -void FfmpegStream::receiveAvailFrames( - int getPtsOnly, - DecoderOutput& decoderOutput) { - int result = 0; - while ((result = receiveFrame()) >= 0) { - unique_ptr decodedFrame = getFrameData(getPtsOnly); - - if (decodedFrame && - ((!getPtsOnly && decodedFrame->frameSize_ > 0) || getPtsOnly)) { - if (isFramePtsInRange()) { - decoderOutput.addMediaFrame(getMediaType(), std::move(decodedFrame)); - } - } // end-if - } // end-while -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegStream.h b/torchvision/csrc/cpu/video_reader/FfmpegStream.h deleted file mode 100644 index b66a36977ec..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegStream.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -#pragma once - -#include -#include -#include -#include "FfmpegHeaders.h" -#include "Interface.h" - -/* -Class uses FFMPEG library to decode one media stream (audio or video). -*/ -class FfmpegStream { - public: - FfmpegStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - double seekFrameMargin); - virtual ~FfmpegStream(); - - // returns 0 - on success or negative error - int openCodecContext(); - // returns stream index - int getIndex() const { - return index_; - } - // returns number decoded/sampled bytes - std::unique_ptr getFrameData(int getPtsOnly); - // flush the stream at the end of decoding. 
- // Return 0 on success and -1 when cache is drained - void flush(int getPtsOnly, DecoderOutput& decoderOutput); - // seek a frame - int seekFrame(int64_t ts); - // send an AVPacket - int sendPacket(const AVPacket* packet); - // receive AVFrame - int receiveFrame(); - // receive all available frames from the internal buffer - void receiveAvailFrames(int getPtsOnly, DecoderOutput& decoderOutput); - // return media type - virtual MediaType getMediaType() const = 0; - // return media format - virtual FormatUnion getMediaFormat() const = 0; - // return start presentation timestamp - virtual int64_t getStartPts() const = 0; - // return end presentation timestamp - virtual int64_t getEndPts() const = 0; - // is the pts of most recent frame within range? - bool isFramePtsInRange(); - // does the pts of most recent frame exceed range? - bool isFramePtsExceedRange(); - - protected: - virtual int initFormat() = 0; - // returns a decoded frame - virtual std::unique_ptr sampleFrameData() = 0; - - protected: - AVFormatContext* const inputCtx_; - const int index_; - enum AVMediaType avMediaType_; - - AVCodecContext* codecCtx_{nullptr}; - AVFrame* frame_{nullptr}; - // pts of last decoded frame - int64_t last_pts_{0}; - double seekFrameMargin_{1.0}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp b/torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp deleted file mode 100644 index 9e804ee67c0..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp +++ /dev/null @@ -1,111 +0,0 @@ -#include "FfmpegUtil.h" - -using namespace std; - -namespace ffmpeg_util { - -bool mapFfmpegType(AVMediaType media, MediaType* type) { - switch (media) { - case AVMEDIA_TYPE_VIDEO: - *type = MediaType::TYPE_VIDEO; - return true; - case AVMEDIA_TYPE_AUDIO: - *type = MediaType::TYPE_AUDIO; - return true; - default: - return false; - } -} - -bool mapMediaType(MediaType type, AVMediaType* media) { - switch (type) { - case MediaType::TYPE_VIDEO: - *media = AVMEDIA_TYPE_VIDEO; - return true; - case MediaType::TYPE_AUDIO: - *media = AVMEDIA_TYPE_AUDIO; - return true; - default: - return false; - } -} - -void setFormatDimensions( - int& destW, - int& destH, - int userW, - int userH, - int srcW, - int srcH, - int minDimension) { - // rounding rules - // int -> double -> round - // round up if fraction is >= 0.5 or round down if fraction is < 0.5 - // int result = double(value) + 0.5 - // here we rounding double to int according to the above rule - if (userW == 0 && userH == 0) { - if (minDimension > 0) { // #2 - if (srcW > srcH) { - // landscape - destH = minDimension; - destW = round(double(srcW * minDimension) / srcH); - } else { - // portrait - destW = minDimension; - destH = round(double(srcH * minDimension) / srcW); - } - } else { // #1 - destW = srcW; - destH = srcH; - } - } else if (userW != 0 && userH == 0) { // #3 - destW = userW; - destH = round(double(srcH * userW) / srcW); - } else if (userW == 0 && userH != 0) { // #4 - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { - // userW != 0 && userH != 0. 
#5 - destW = userW; - destH = userH; - } - // prevent zeros - destW = std::max(destW, 1); - destH = std::max(destH, 1); -} - -bool validateVideoFormat(const VideoFormat& f) { - /* - Valid parameters values for decoder - ___________________________________________________ - | W | H | minDimension | algorithm | - |_________________________________________________| - | 0 | 0 | 0 | original | - |_________________________________________________| - | 0 | 0 | >0 |scale to min dimension| - |_____|_____|____________________________________ | - | >0 | 0 | 0 | scale keeping W | - |_________________________________________________| - | 0 | >0 | 0 | scale keeping H | - |_________________________________________________| - | >0 | >0 | 0 | stretch/scale | - |_________________________________________________| - - */ - return (f.width == 0 && f.height == 0) || // #1 and #2 - (f.width != 0 && f.height != 0 && f.minDimension == 0) || // # 5 - (((f.width != 0 && f.height == 0) || // #3 and #4 - (f.width == 0 && f.height != 0)) && - f.minDimension == 0); -} - -string getErrorDesc(int errnum) { - array buffer; - if (av_strerror(errnum, buffer.data(), buffer.size()) < 0) { - return string("Unknown error code"); - } - buffer.back() = 0; - return string(buffer.data()); -} - -} // namespace ffmpeg_util diff --git a/torchvision/csrc/cpu/video_reader/FfmpegUtil.h b/torchvision/csrc/cpu/video_reader/FfmpegUtil.h deleted file mode 100644 index 9f42eb53c97..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegUtil.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include "FfmpegHeaders.h" -#include "Interface.h" - -namespace ffmpeg_util { - -bool mapFfmpegType(AVMediaType media, enum MediaType* type); - -bool mapMediaType(MediaType type, enum AVMediaType* media); - -void setFormatDimensions( - int& destW, - int& destH, - int userW, - int userH, - int srcW, - int srcH, - int minDimension); - -bool validateVideoFormat(const VideoFormat& f); - -std::string getErrorDesc(int errnum); - -} // namespace ffmpeg_util diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp b/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp deleted file mode 100644 index d87b3104dd5..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include "FfmpegVideoSampler.h" -#include "FfmpegUtil.h" - -using namespace std; - -FfmpegVideoSampler::FfmpegVideoSampler( - const VideoFormat& in, - const VideoFormat& out, - int swsFlags) - : inFormat_(in), outFormat_(out), swsFlags_(swsFlags) {} - -FfmpegVideoSampler::~FfmpegVideoSampler() { - if (scaleContext_) { - sws_freeContext(scaleContext_); - scaleContext_ = nullptr; - } -} - -int FfmpegVideoSampler::init() { - VLOG(1) << "Input format: width " << inFormat_.width << ", height " - << inFormat_.height << ", format " << inFormat_.format - << ", minDimension " << inFormat_.minDimension; - VLOG(1) << "Scale format: width " << outFormat_.width << ", height " - << outFormat_.height << ", format " << outFormat_.format - << ", minDimension " << outFormat_.minDimension; - - scaleContext_ = sws_getContext( - inFormat_.width, - inFormat_.height, - (AVPixelFormat)inFormat_.format, - outFormat_.width, - outFormat_.height, - static_cast(outFormat_.format), - swsFlags_, - nullptr, - nullptr, - nullptr); - if (scaleContext_) { - return 0; - } else { - return -1; - } -} - -int32_t FfmpegVideoSampler::getImageBytes() const { - return av_image_get_buffer_size( - (AVPixelFormat)outFormat_.format, outFormat_.width, 
outFormat_.height, 1); -} - -// https://ffmpeg.org/doxygen/3.4/scaling_video_8c-example.html#a10 -unique_ptr FfmpegVideoSampler::sample(const AVFrame* frame) { - if (!frame) { - return nullptr; // no flush for videos - } - // scaled and cropped image - auto outImageSize = getImageBytes(); - AvDataPtr frameData(static_cast(av_malloc(outImageSize))); - - uint8_t* scalePlanes[4] = {nullptr}; - int scaleLines[4] = {0}; - - int result; - if ((result = av_image_fill_arrays( - scalePlanes, - scaleLines, - frameData.get(), - static_cast(outFormat_.format), - outFormat_.width, - outFormat_.height, - 1)) < 0) { - LOG(ERROR) << "av_image_fill_arrays failed, err: " - << ffmpeg_util::getErrorDesc(result); - return nullptr; - } - - if ((result = sws_scale( - scaleContext_, - frame->data, - frame->linesize, - 0, - inFormat_.height, - scalePlanes, - scaleLines)) < 0) { - LOG(ERROR) << "sws_scale failed, err: " - << ffmpeg_util::getErrorDesc(result); - return nullptr; - } - - return make_unique(std::move(frameData), outImageSize, 0); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h b/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h deleted file mode 100644 index 1fd6862f537..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include "FfmpegSampler.h" - -/** - * Class transcode video frames from one format into another - */ - -class FfmpegVideoSampler : public FfmpegSampler { - public: - explicit FfmpegVideoSampler( - const VideoFormat& in, - const VideoFormat& out, - int swsFlags = SWS_AREA); - ~FfmpegVideoSampler() override; - - int init() override; - - int32_t getImageBytes() const; - // returns number of bytes of the sampled data - std::unique_ptr sample(const AVFrame* frame) override; - - const VideoFormat& getInFormat() const { - return inFormat_; - } - - private: - VideoFormat inFormat_; - VideoFormat outFormat_; - int swsFlags_; - SwsContext* scaleContext_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp b/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp deleted file mode 100644 index 7a429249a71..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp +++ /dev/null @@ -1,115 +0,0 @@ -#include "FfmpegVideoStream.h" -#include "FfmpegUtil.h" - -using namespace std; - -namespace { - -bool operator==(const VideoFormat& x, const AVFrame& y) { - return x.width == y.width && x.height == y.height && - x.format == static_cast(y.format); -} - -VideoFormat toVideoFormat(const AVFrame& frame) { - VideoFormat videoFormat; - videoFormat.width = frame.width; - videoFormat.height = frame.height; - videoFormat.format = static_cast(frame.format); - - return videoFormat; -} - -} // namespace - -FfmpegVideoStream::FfmpegVideoStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin) - : FfmpegStream(inputCtx, index, avMediaType, seekFrameMargin), - mediaFormat_(mediaFormat) {} - -FfmpegVideoStream::~FfmpegVideoStream() {} - -void FfmpegVideoStream::checkStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first > 0) { - CHECK_EQ(timeBase.first, inputCtx_->streams[index_]->time_base.num); - CHECK_EQ(timeBase.second, inputCtx_->streams[index_]->time_base.den); - } -} - -void FfmpegVideoStream::updateStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first == 0) { - mediaFormat_.format.video.timeBaseNum = - inputCtx_->streams[index_]->time_base.num; - 
mediaFormat_.format.video.timeBaseDen = - inputCtx_->streams[index_]->time_base.den; - } - mediaFormat_.format.video.duration = inputCtx_->streams[index_]->duration; -} - -int FfmpegVideoStream::initFormat() { - // set output format - VideoFormat& format = mediaFormat_.format.video; - if (!ffmpeg_util::validateVideoFormat(format)) { - LOG(ERROR) << "Invalid video format"; - return -1; - } - - format.fps = av_q2d( - av_guess_frame_rate(inputCtx_, inputCtx_->streams[index_], nullptr)); - - // keep aspect ratio - ffmpeg_util::setFormatDimensions( - format.width, - format.height, - format.width, - format.height, - codecCtx_->width, - codecCtx_->height, - format.minDimension); - - VLOG(1) << "After adjusting, video format" - << ", width: " << format.width << ", height: " << format.height - << ", format: " << format.format - << ", minDimension: " << format.minDimension; - - if (format.format == AV_PIX_FMT_NONE) { - format.format = codecCtx_->pix_fmt; - VLOG(1) << "Set pixel format: " << format.format; - } - - checkStreamDecodeParams(); - - updateStreamDecodeParams(); - - return format.width != 0 && format.height != 0 && - format.format != AV_PIX_FMT_NONE - ? 0 - : -1; -} - -unique_ptr FfmpegVideoStream::sampleFrameData() { - VideoFormat& format = mediaFormat_.format.video; - if (!sampler_ || !(sampler_->getInFormat() == *frame_)) { - VideoFormat newInFormat = toVideoFormat(*frame_); - sampler_ = make_unique(newInFormat, format, SWS_AREA); - VLOG(1) << "Set input video sampler format" - << ", width: " << newInFormat.width - << ", height: " << newInFormat.height - << ", format: " << newInFormat.format - << " : output video sampler format" - << ", width: " << format.width << ", height: " << format.height - << ", format: " << format.format - << ", minDimension: " << format.minDimension; - int ret = sampler_->init(); - if (ret < 0) { - VLOG(1) << "Fail to initialize video sampler"; - return nullptr; - } - } - return sampler_->sample(frame_); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h b/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h deleted file mode 100644 index 9bfbc9f665b..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include -#include "FfmpegStream.h" -#include "FfmpegVideoSampler.h" - -/** - * Class uses FFMPEG library to decode one video stream. 
- */ -class FfmpegVideoStream : public FfmpegStream { - public: - explicit FfmpegVideoStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin); - - ~FfmpegVideoStream() override; - - // FfmpegStream overrides - MediaType getMediaType() const override { - return MediaType::TYPE_VIDEO; - } - - FormatUnion getMediaFormat() const override { - return mediaFormat_.format; - } - - int64_t getStartPts() const override { - return mediaFormat_.format.video.startPts; - } - int64_t getEndPts() const override { - return mediaFormat_.format.video.endPts; - } - // return numerator and denominator of time base - std::pair getTimeBase() const { - return std::make_pair( - mediaFormat_.format.video.timeBaseNum, - mediaFormat_.format.video.timeBaseDen); - } - - void checkStreamDecodeParams(); - - void updateStreamDecodeParams(); - - protected: - int initFormat() override; - std::unique_ptr sampleFrameData() override; - - private: - MediaFormat mediaFormat_; - std::unique_ptr sampler_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/Interface.cpp b/torchvision/csrc/cpu/video_reader/Interface.cpp deleted file mode 100644 index 0ec9f155821..00000000000 --- a/torchvision/csrc/cpu/video_reader/Interface.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include "Interface.h" - -void DecoderOutput::initMediaType(MediaType mediaType, FormatUnion format) { - MediaData mediaData(format); - media_data_.emplace(mediaType, std::move(mediaData)); -} - -void DecoderOutput::addMediaFrame( - MediaType mediaType, - std::unique_ptr frame) { - if (media_data_.find(mediaType) != media_data_.end()) { - VLOG(1) << "media type: " << mediaType - << " add frame with pts: " << frame->pts_; - media_data_[mediaType].frames_.push_back(std::move(frame)); - } else { - VLOG(1) << "media type: " << mediaType << " not found. Skip the frame."; - } -} - -void DecoderOutput::clear() { - media_data_.clear(); -} diff --git a/torchvision/csrc/cpu/video_reader/Interface.h b/torchvision/csrc/cpu/video_reader/Interface.h deleted file mode 100644 index e137008ce7b..00000000000 --- a/torchvision/csrc/cpu/video_reader/Interface.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -extern "C" { - -#include -#include -void av_free(void* ptr); -} - -struct avDeleter { - void operator()(uint8_t* p) const { - av_free(p); - } -}; - -const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; -const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT; - -using AvDataPtr = std::unique_ptr; - -enum MediaType : uint32_t { - TYPE_VIDEO = 1, - TYPE_AUDIO = 2, -}; - -struct EnumClassHash { - template - uint32_t operator()(T t) const { - return static_cast(t); - } -}; - -struct VideoFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - - int width{0}; // width in pixels - int height{0}; // height in pixels - int minDimension{0}; // choose min dimension and rescale accordingly - // Output image pixel format. 
data type AVPixelFormat - AVPixelFormat format{defaultVideoPixelFormat}; // type AVPixelFormat - int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp - int timeBaseNum{0}; - int timeBaseDen{1}; // numerator and denominator of time base - float fps{0.0}; - int64_t duration{0}; // duration of the stream, in stream time base -}; - -struct AudioFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - - int samples{0}; // number samples per second (frequency) - int channels{0}; // number of channels - AVSampleFormat format{defaultAudioSampleFormat}; // type AVSampleFormat - int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp - int timeBaseNum{0}; - int timeBaseDen{1}; // numerator and denominator of time base - int64_t duration{0}; // duration of the stream, in stream time base -}; - -union FormatUnion { - FormatUnion() {} - VideoFormat video; - AudioFormat audio; -}; - -struct MediaFormat { - MediaFormat() {} - - MediaFormat(const MediaFormat& mediaFormat) : type(mediaFormat.type) { - if (type == MediaType::TYPE_VIDEO) { - format.video = mediaFormat.format.video; - } else if (type == MediaType::TYPE_AUDIO) { - format.audio = mediaFormat.format.audio; - } - } - - MediaFormat(MediaType mediaType) : type(mediaType) { - if (mediaType == MediaType::TYPE_VIDEO) { - format.video = VideoFormat(); - } else if (mediaType == MediaType::TYPE_AUDIO) { - format.audio = AudioFormat(); - } - } - // media type - MediaType type; - // format data - FormatUnion format; -}; - -class DecodedFrame { - public: - explicit DecodedFrame() : frame_(nullptr), frameSize_(0), pts_(0) {} - explicit DecodedFrame(AvDataPtr frame, int frameSize, int64_t pts) - : frame_(std::move(frame)), frameSize_(frameSize), pts_(pts) {} - AvDataPtr frame_{nullptr}; - int frameSize_{0}; - int64_t pts_{0}; -}; - -struct MediaData { - MediaData() {} - MediaData(FormatUnion format) : format_(format) {} - FormatUnion format_; - std::vector> frames_; -}; - -class DecoderOutput { - public: - explicit DecoderOutput() {} - - ~DecoderOutput() {} - - void initMediaType(MediaType mediaType, FormatUnion format); - - void addMediaFrame(MediaType mediaType, std::unique_ptr frame); - - void clear(); - - std::unordered_map media_data_; -}; diff --git a/torchvision/csrc/cpu/video_reader/VideoReader.cpp b/torchvision/csrc/cpu/video_reader/VideoReader.cpp index dfe7f46bf39..7578927f1b5 100644 --- a/torchvision/csrc/cpu/video_reader/VideoReader.cpp +++ b/torchvision/csrc/cpu/video_reader/VideoReader.cpp @@ -3,11 +3,11 @@ #include #include #include -#include "FfmpegDecoder.h" -#include "FfmpegHeaders.h" -#include "util.h" +#include "memory_buffer.h" +#include "sync_decoder.h" using namespace std; +using namespace ffmpeg; // If we are in a Windows environment, we need to define // initialization functions for the _custom_ops extension @@ -27,121 +27,157 @@ PyMODINIT_FUNC PyInit_video_reader(void) { namespace video_reader { -class UnknownPixelFormatException : public exception { - const char* what() const throw() override { - return "Unknown pixel format"; - } -}; - -int getChannels(AVPixelFormat format) { - int numChannels = 0; - switch (format) { - case AV_PIX_FMT_BGR24: - case AV_PIX_FMT_RGB24: - numChannels = 3; - break; - default: - LOG(ERROR) << "Unknown format: " << format; - throw UnknownPixelFormatException(); - } - return numChannels; -} +const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; +const AVSampleFormat 
defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT; +const size_t decoderTimeoutMs = 600000; +// A small jitter is added to the end of the range to absorb conversion/rounding +// error: 100us is too small to select an extra frame, but large enough to +// compensate for the rounding error accumulated over the multiple conversions. +const size_t timeBaseJitterUs = 100; + +DecoderParameters getDecoderParams( + int64_t videoStartUs, + int64_t videoEndUs, + double seekFrameMarginUs, + int64_t getPtsOnly, + int64_t readVideoStream, + int videoWidth, + int videoHeight, + int videoMinDimension, + int64_t readAudioStream, + int audioSamples, + int audioChannels) { + DecoderParameters params; + params.headerOnly = getPtsOnly != 0; + params.seekAccuracy = seekFrameMarginUs; + params.startOffset = videoStartUs; + params.endOffset = videoEndUs; + params.timeoutMs = decoderTimeoutMs; + params.preventStaleness = false; -void fillVideoTensor( - std::vector>& frames, - torch::Tensor& videoFrame, - torch::Tensor& videoFramePts) { - int frameSize = 0; - if (videoFrame.numel() > 0) { - frameSize = videoFrame.numel() / frames.size(); + if (readVideoStream == 1) { + MediaFormat videoFormat(0); + videoFormat.type = TYPE_VIDEO; + videoFormat.format.video.format = defaultVideoPixelFormat; + videoFormat.format.video.width = videoWidth; + videoFormat.format.video.height = videoHeight; + videoFormat.format.video.minDimension = videoMinDimension; + params.formats.insert(videoFormat); } - int frameCount = 0; + if (readAudioStream == 1) { + MediaFormat audioFormat; + audioFormat.type = TYPE_AUDIO; + audioFormat.format.audio.format = defaultAudioSampleFormat; + audioFormat.format.audio.samples = audioSamples; + audioFormat.format.audio.channels = audioChannels; + params.formats.insert(audioFormat); + } - uint8_t* videoFrameData = - videoFrame.numel() > 0 ? videoFrame.data_ptr() : nullptr; - int64_t* videoFramePtsData = videoFramePts.data_ptr(); + return params; +} - for (size_t i = 0; i < frames.size(); ++i) { - const auto& frame = frames[i]; - if (videoFrameData) { - memcpy( - videoFrameData + (size_t)(frameCount++) * (size_t)frameSize, - frame->frame_.get(), - frameSize * sizeof(uint8_t)); +// Returns the number of bytes written. +template +size_t fillTensor( + std::vector& msgs, + torch::Tensor& frame, + torch::Tensor& framePts, + int64_t num, + int64_t den) { + if (msgs.empty()) { + return 0; + } + T* frameData = frame.numel() > 0 ? 
frame.data_ptr() : nullptr; + int64_t* framePtsData = framePts.data_ptr(); + CHECK_EQ(framePts.size(0), msgs.size()); + size_t avgElementsInFrame = frame.numel() / msgs.size(); + + size_t offset = 0; + for (size_t i = 0; i < msgs.size(); ++i) { + const auto& msg = msgs[i]; + // convert pts into original time_base + AVRational avr = {(int)num, (int)den}; + framePtsData[i] = av_rescale_q(msg.header.pts, AV_TIME_BASE_Q, avr); + VLOG(2) << "PTS type: " << sizeof(T) << ", us: " << msg.header.pts + << ", original: " << framePtsData[i]; + + if (frameData) { + auto sizeInBytes = msg.payload->length(); + memcpy(frameData + offset, msg.payload->data(), sizeInBytes); + if (sizeof(T) == sizeof(uint8_t)) { + // Video - move by allocated frame size + offset += avgElementsInFrame / sizeof(T); + } else { + // Audio - move by number of samples + offset += sizeInBytes / sizeof(T); + } } - videoFramePtsData[i] = frame->pts_; } + return offset * sizeof(T); } -void getVideoMeta( - DecoderOutput& decoderOutput, - int& numFrames, - int& height, - int& width, - int& numChannels) { - auto& videoFrames = decoderOutput.media_data_[TYPE_VIDEO].frames_; - numFrames = videoFrames.size(); - - FormatUnion& videoFormat = decoderOutput.media_data_[TYPE_VIDEO].format_; - height = videoFormat.video.height; - width = videoFormat.video.width; - numChannels = getChannels(videoFormat.video.format); +size_t fillVideoTensor( + std::vector& msgs, + torch::Tensor& videoFrame, + torch::Tensor& videoFramePts, + int64_t num, + int64_t den) { + return fillTensor(msgs, videoFrame, videoFramePts, num, den); } -void fillAudioTensor( - std::vector>& frames, +size_t fillAudioTensor( + std::vector& msgs, torch::Tensor& audioFrame, - torch::Tensor& audioFramePts) { - if (frames.size() == 0) { - return; - } - - float* audioFrameData = - audioFrame.numel() > 0 ? 
audioFrame.data_ptr() : nullptr; - CHECK_EQ(audioFramePts.size(0), frames.size()); - int64_t* audioFramePtsData = audioFramePts.data_ptr(); - - int bytesPerSample = av_get_bytes_per_sample(defaultAudioSampleFormat); - - int64_t frameDataOffset = 0; - for (size_t i = 0; i < frames.size(); ++i) { - audioFramePtsData[i] = frames[i]->pts_; - if (audioFrameData) { - memcpy( - audioFrameData + frameDataOffset, - frames[i]->frame_.get(), - frames[i]->frameSize_); - frameDataOffset += (frames[i]->frameSize_ / bytesPerSample); - } - } + torch::Tensor& audioFramePts, + int64_t num, + int64_t den) { + return fillTensor(msgs, audioFrame, audioFramePts, num, den); } -void getAudioMeta( - DecoderOutput& decoderOutput, - int64_t& numSamples, - int64_t& channels, - int64_t& numFrames) { - FormatUnion& audioFormat = decoderOutput.media_data_[TYPE_AUDIO].format_; - - channels = audioFormat.audio.channels; - CHECK_EQ(audioFormat.audio.format, AV_SAMPLE_FMT_FLT); - int bytesPerSample = av_get_bytes_per_sample( - static_cast(audioFormat.audio.format)); - - // auto& audioFrames = decoderOutput.media_frames_[TYPE_AUDIO]; - auto& audioFrames = decoderOutput.media_data_[TYPE_AUDIO].frames_; - numFrames = audioFrames.size(); - int64_t frameSizeTotal = 0; - for (auto const& decodedFrame : audioFrames) { - frameSizeTotal += static_cast(decodedFrame->frameSize_); +void offsetsToUs( + double& seekFrameMargin, + int64_t readVideoStream, + int64_t videoStartPts, + int64_t videoEndPts, + int64_t videoTimeBaseNum, + int64_t videoTimeBaseDen, + int64_t readAudioStream, + int64_t audioStartPts, + int64_t audioEndPts, + int64_t audioTimeBaseNum, + int64_t audioTimeBaseDen, + int64_t& videoStartUs, + int64_t& videoEndUs) { + seekFrameMargin *= AV_TIME_BASE; + videoStartUs = 0; + videoEndUs = -1; + + if (readVideoStream) { + AVRational vr = {(int)videoTimeBaseNum, (int)videoTimeBaseDen}; + if (videoStartPts > 0) { + videoStartUs = av_rescale_q(videoStartPts, vr, AV_TIME_BASE_Q); + } + if (videoEndPts > 0) { + // Add a small jitter (timeBaseJitterUs) to the end of the range: it is too + // small to select the next frame, but compensates for the rounding error + // introduced by the multiple conversions. + videoEndUs = + timeBaseJitterUs + av_rescale_q(videoEndPts, vr, AV_TIME_BASE_Q); + } + } else if (readAudioStream) { + AVRational ar = {(int)audioTimeBaseNum, (int)audioTimeBaseDen}; + if (audioStartPts > 0) { + videoStartUs = av_rescale_q(audioStartPts, ar, AV_TIME_BASE_Q); + } + if (audioEndPts > 0) { + // Add a small jitter (timeBaseJitterUs) to the end of the range: it is too + // small to select the next frame, but compensates for the rounding error + // introduced by the multiple conversions. 
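+ // Illustrative arithmetic (values assumed, not from this patch): with an audio + // time base of 1/44100 and audioEndPts = 441000, + // av_rescale_q(441000, {1, 44100}, AV_TIME_BASE_Q) yields exactly 10000000us; + // the extra 100us keeps a frame stamped at 10000000us inside the range even + // after lossy pts <-> us round trips. 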
+ videoEndUs = + timeBaseJitterUs + av_rescale_q(audioEndPts, ar, AV_TIME_BASE_Q); + } } - VLOG(2) << "numFrames: " << numFrames; - VLOG(2) << "frameSizeTotal: " << frameSizeTotal; - VLOG(2) << "channels: " << channels; - VLOG(2) << "bytesPerSample: " << bytesPerSample; - CHECK_EQ(frameSizeTotal % (channels * bytesPerSample), 0); - numSamples = frameSizeTotal / (channels * bytesPerSample); } torch::List readVideo( @@ -165,38 +201,83 @@ torch::List readVideo( int64_t audioEndPts, int64_t audioTimeBaseNum, int64_t audioTimeBaseDen) { - unique_ptr params = util::getDecoderParams( + int64_t videoStartUs, videoEndUs; + + offsetsToUs( seekFrameMargin, - getPtsOnly, readVideoStream, - width, - height, - minDimension, videoStartPts, videoEndPts, videoTimeBaseNum, videoTimeBaseDen, readAudioStream, - audioSamples, - audioChannels, audioStartPts, audioEndPts, audioTimeBaseNum, - audioTimeBaseDen); - - FfmpegDecoder decoder; - DecoderOutput decoderOutput; + audioTimeBaseDen, + videoStartUs, + videoEndUs); + + DecoderParameters params = getDecoderParams( + videoStartUs, // videoStartPts + videoEndUs, // videoEndPts + seekFrameMargin, // seekFrameMargin + getPtsOnly, // getPtsOnly + readVideoStream, // readVideoStream + width, // width + height, // height + minDimension, // minDimension + readAudioStream, // readAudioStream + audioSamples, // audioSamples + audioChannels // audioChannels + ); + SyncDecoder decoder; + std::vector audioMessages, videoMessages; + DecoderInCallback callback = nullptr; + std::string logMessage, logType; if (isReadFile) { - decoder.decodeFile(std::move(params), videoPath, decoderOutput); + params.uri = videoPath; + logType = "file"; + logMessage = videoPath; } else { - decoder.decodeMemory( - std::move(params), - input_video.data_ptr(), - input_video.size(0), - decoderOutput); + callback = MemoryBuffer::getCallback( + input_video.data_ptr(), input_video.size(0)); + logType = "memory"; + logMessage = std::to_string(input_video.size(0)); } + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] has started"; + + const auto now = std::chrono::system_clock::now(); + + bool succeeded; + if ((succeeded = decoder.init(params, std::move(callback)))) { + int res; + DecoderOutputMessage msg; + while (0 == (res = decoder.decode(&msg, decoderTimeoutMs))) { + if (msg.header.format.type == TYPE_VIDEO) { + videoMessages.push_back(std::move(msg)); + } + if (msg.header.format.type == TYPE_AUDIO) { + audioMessages.push_back(std::move(msg)); + } + msg.payload.reset(); + } + + const auto then = std::chrono::system_clock::now(); + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] has finished, " + << std::chrono::duration_cast(then - now) + .count() + << " us"; + } else { + LOG(ERROR) << "Decoder initialization has failed"; + } + + decoder.shutdown(); + // video section torch::Tensor videoFrame = torch::zeros({0}, torch::kByte); torch::Tensor videoFramePts = torch::zeros({0}, torch::kLong); @@ -204,37 +285,50 @@ torch::List readVideo( torch::Tensor videoFps = torch::zeros({0}, torch::kFloat); torch::Tensor videoDuration = torch::zeros({0}, torch::kLong); - if (readVideoStream == 1) { - auto it = decoderOutput.media_data_.find(TYPE_VIDEO); - if (it != decoderOutput.media_data_.end()) { - int numVideoFrames, outHeight, outWidth, numChannels; - getVideoMeta( - decoderOutput, numVideoFrames, outHeight, outWidth, numChannels); - + if (succeeded && readVideoStream == 1) { + if (!videoMessages.empty()) { + const auto& header = videoMessages[0].header; + 
const auto& media = header.format; + const auto& format = media.format.video; + int numVideoFrames = videoMessages.size(); + int outHeight = format.height; + int outWidth = format.width; + int numChannels = 3; // decoder guarantees the default AV_PIX_FMT_RGB24 + + size_t expectedWrittenBytes = 0; if (getPtsOnly == 0) { videoFrame = torch::zeros( {numVideoFrames, outHeight, outWidth, numChannels}, torch::kByte); + expectedWrittenBytes = + numVideoFrames * outHeight * outWidth * numChannels; } videoFramePts = torch::zeros({numVideoFrames}, torch::kLong); - fillVideoTensor( - decoderOutput.media_data_[TYPE_VIDEO].frames_, - videoFrame, - videoFramePts); + VLOG(2) << "video duration: " << media.duration << ", fps: " << header.fps + << ", num: " << media.num << ", den: " << media.den + << ", num frames: " << numVideoFrames; + + auto numberWrittenBytes = fillVideoTensor( + videoMessages, videoFrame, videoFramePts, media.num, media.den); + + CHECK_EQ(numberWrittenBytes, expectedWrittenBytes); videoTimeBase = torch::zeros({2}, torch::kInt); int* videoTimeBaseData = videoTimeBase.data_ptr(); - videoTimeBaseData[0] = it->second.format_.video.timeBaseNum; - videoTimeBaseData[1] = it->second.format_.video.timeBaseDen; + videoTimeBaseData[0] = media.num; + videoTimeBaseData[1] = media.den; videoFps = torch::zeros({1}, torch::kFloat); float* videoFpsData = videoFps.data_ptr(); - videoFpsData[0] = it->second.format_.video.fps; + videoFpsData[0] = header.fps; videoDuration = torch::zeros({1}, torch::kLong); int64_t* videoDurationData = videoDuration.data_ptr(); - videoDurationData[0] = it->second.format_.video.duration; + AVRational avr = {(int)media.num, (int)media.den}; + videoDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] filled video tensors"; } else { VLOG(1) << "Miss video stream"; } @@ -246,39 +340,58 @@ torch::List readVideo( torch::Tensor audioTimeBase = torch::zeros({0}, torch::kInt); torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt); torch::Tensor audioDuration = torch::zeros({0}, torch::kLong); - if (readAudioStream == 1) { - auto it = decoderOutput.media_data_.find(TYPE_AUDIO); - if (it != decoderOutput.media_data_.end()) { - VLOG(1) << "Find audio stream"; - int64_t numAudioSamples = 0, outAudioChannels = 0, numAudioFrames = 0; - getAudioMeta( - decoderOutput, numAudioSamples, outAudioChannels, numAudioFrames); - VLOG(2) << "numAudioSamples: " << numAudioSamples; - VLOG(2) << "outAudioChannels: " << outAudioChannels; - VLOG(2) << "numAudioFrames: " << numAudioFrames; - + if (succeeded && readAudioStream == 1) { + if (!audioMessages.empty()) { + const auto& header = audioMessages[0].header; + const auto& media = header.format; + const auto& format = media.format.audio; + + int64_t outAudioChannels = format.channels; + int bytesPerSample = + av_get_bytes_per_sample(static_cast(format.format)); + + int numAudioFrames = audioMessages.size(); + int64_t numAudioSamples = 0; if (getPtsOnly == 0) { + int64_t frameSizeTotal = 0; + for (auto const& audioMessage : audioMessages) { + frameSizeTotal += audioMessage.payload->length(); + } + + CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0); + numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample); + audioFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat); } audioFramePts = torch::zeros({numAudioFrames}, torch::kLong); - fillAudioTensor( - decoderOutput.media_data_[TYPE_AUDIO].frames_, - 
audioFrame, - audioFramePts); + + VLOG(2) << "audio duration: " << media.duration + << ", channels: " << format.channels + << ", sample rate: " << format.samples << ", num: " << media.num + << ", den: " << media.den; + + auto numberWrittenBytes = fillAudioTensor( + audioMessages, audioFrame, audioFramePts, media.num, media.den); + CHECK_EQ( + numberWrittenBytes, + numAudioSamples * outAudioChannels * sizeof(float)); audioTimeBase = torch::zeros({2}, torch::kInt); int* audioTimeBaseData = audioTimeBase.data_ptr(); - audioTimeBaseData[0] = it->second.format_.audio.timeBaseNum; - audioTimeBaseData[1] = it->second.format_.audio.timeBaseDen; + audioTimeBaseData[0] = media.num; + audioTimeBaseData[1] = media.den; audioSampleRate = torch::zeros({1}, torch::kInt); int* audioSampleRateData = audioSampleRate.data_ptr(); - audioSampleRateData[0] = it->second.format_.audio.samples; + audioSampleRateData[0] = format.samples; audioDuration = torch::zeros({1}, torch::kLong); int64_t* audioDurationData = audioDuration.data_ptr(); - audioDurationData[0] = it->second.format_.audio.duration; + AVRational avr = {(int)media.num, (int)media.den}; + audioDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] filled audio tensors"; } else { VLOG(1) << "Miss audio stream"; } @@ -296,6 +409,9 @@ torch::List readVideo( result.push_back(std::move(audioSampleRate)); result.push_back(std::move(audioDuration)); + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] about to return"; + return result; } @@ -388,59 +504,101 @@ torch::List probeVideo( bool isReadFile, const torch::Tensor& input_video, std::string videoPath) { - unique_ptr params = util::getDecoderParams( + DecoderParameters params = getDecoderParams( + 0, // videoStartUs + -1, // videoEndUs 0, // seekFrameMargin - 0, // getPtsOnly + 1, // getPtsOnly 1, // readVideoStream 0, // width 0, // height 0, // minDimension - 0, // videoStartPts - 0, // videoEndPts - 0, // videoTimeBaseNum - 1, // videoTimeBaseDen 1, // readAudioStream 0, // audioSamples - 0, // audioChannels - 0, // audioStartPts - 0, // audioEndPts - 0, // audioTimeBaseNum - 1 // audioTimeBaseDen + 0 // audioChannels ); - FfmpegDecoder decoder; - DecoderOutput decoderOutput; + SyncDecoder decoder; + DecoderOutputMessage audioMessage, videoMessage; + DecoderInCallback callback = nullptr; + std::string logMessage, logType; if (isReadFile) { - decoder.probeFile(std::move(params), videoPath, decoderOutput); + params.uri = videoPath; + logType = "file"; + logMessage = videoPath; + } else { + callback = MemoryBuffer::getCallback( + input_video.data_ptr(), input_video.size(0)); + logType = "memory"; + logMessage = std::to_string(input_video.size(0)); + } + + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] has started"; + + const auto now = std::chrono::system_clock::now(); + + bool succeeded; + bool gotAudio = false, gotVideo = false; + if ((succeeded = decoder.init(params, std::move(callback)))) { + int res; + DecoderOutputMessage msg; + while (0 == (res = decoder.decode(&msg, decoderTimeoutMs)) && + (!gotAudio || !gotVideo)) { + if (msg.header.format.type == TYPE_VIDEO && !gotVideo) { + videoMessage = std::move(msg); + gotVideo = true; + } + if (msg.header.format.type == TYPE_AUDIO && !gotAudio) { + audioMessage = std::move(msg); + gotAudio = true; + } + msg.payload.reset(); + } + succeeded = (res == 0 || res == ENODATA); + + const auto then = 
std::chrono::system_clock::now(); + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] has finished, " + << std::chrono::duration_cast(then - now) + .count() + << " us"; } else { - decoder.probeMemory( - std::move(params), - input_video.data_ptr(), - input_video.size(0), - decoderOutput); + LOG(ERROR) << "Decoder initialization has failed"; } + + decoder.shutdown(); + // video section torch::Tensor videoTimeBase = torch::zeros({0}, torch::kInt); torch::Tensor videoFps = torch::zeros({0}, torch::kFloat); torch::Tensor videoDuration = torch::zeros({0}, torch::kLong); - auto it = decoderOutput.media_data_.find(TYPE_VIDEO); - if (it != decoderOutput.media_data_.end()) { - VLOG(1) << "Find video stream"; + if (succeeded && gotVideo) { videoTimeBase = torch::zeros({2}, torch::kInt); int* videoTimeBaseData = videoTimeBase.data_ptr(); - videoTimeBaseData[0] = it->second.format_.video.timeBaseNum; - videoTimeBaseData[1] = it->second.format_.video.timeBaseDen; + const auto& header = videoMessage.header; + const auto& media = header.format; + + videoTimeBaseData[0] = media.num; + videoTimeBaseData[1] = media.den; videoFps = torch::zeros({1}, torch::kFloat); float* videoFpsData = videoFps.data_ptr(); - videoFpsData[0] = it->second.format_.video.fps; + videoFpsData[0] = header.fps; videoDuration = torch::zeros({1}, torch::kLong); int64_t* videoDurationData = videoDuration.data_ptr(); - videoDurationData[0] = it->second.format_.video.duration; + AVRational avr = {(int)media.num, (int)media.den}; + videoDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + + VLOG(2) << "Probe fps: " << header.fps << ", duration: " << media.duration + << ", num: " << media.num << ", den: " << media.den; + + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] filled video tensors"; } else { - VLOG(1) << "Miss video stream"; + LOG(ERROR) << "Missing video stream"; } // audio section @@ -448,21 +606,31 @@ torch::List probeVideo( torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt); torch::Tensor audioDuration = torch::zeros({0}, torch::kLong); - it = decoderOutput.media_data_.find(TYPE_AUDIO); - if (it != decoderOutput.media_data_.end()) { - VLOG(1) << "Find audio stream"; + if (succeeded && gotAudio) { audioTimeBase = torch::zeros({2}, torch::kInt); int* audioTimeBaseData = audioTimeBase.data_ptr(); - audioTimeBaseData[0] = it->second.format_.audio.timeBaseNum; - audioTimeBaseData[1] = it->second.format_.audio.timeBaseDen; + const auto& header = audioMessage.header; + const auto& media = header.format; + const auto& format = media.format.audio; + + audioTimeBaseData[0] = media.num; + audioTimeBaseData[1] = media.den; audioSampleRate = torch::zeros({1}, torch::kInt); int* audioSampleRateData = audioSampleRate.data_ptr(); - audioSampleRateData[0] = it->second.format_.audio.samples; + audioSampleRateData[0] = format.samples; audioDuration = torch::zeros({1}, torch::kLong); int64_t* audioDurationData = audioDuration.data_ptr(); - audioDurationData[0] = it->second.format_.audio.duration; + AVRational avr = {(int)media.num, (int)media.den}; + audioDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + + VLOG(2) << "Probe sample rate: " << format.samples + << ", duration: " << media.duration << ", num: " << media.num + << ", den: " << media.den; + + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] filled audio tensors"; } else { VLOG(1) << "Miss audio stream"; } @@ -475,6 +643,9 @@ 
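+ // Summary of the probe result assembled above: videoTimeBase is kInt[2] + // holding {num, den}, videoFps is kFloat[1], videoDuration is kLong[1] + // (rescaled to the stream time base); the audio counterparts follow below. 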
result.push_back(std::move(audioSampleRate)); result.push_back(std::move(audioDuration)); + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] is about to return"; + return result; } diff --git a/torchvision/csrc/cpu/video_reader/VideoReader.h b/torchvision/csrc/cpu/video_reader/VideoReader.h index efc2e4709a6..923a3190977 100644 --- a/torchvision/csrc/cpu/video_reader/VideoReader.h +++ b/torchvision/csrc/cpu/video_reader/VideoReader.h @@ -1,99 +1,3 @@ #pragma once #include - -// Interface for Python - -/* - return: - videoFrame: tensor (N, H, W, C) kByte - videoFramePts: tensor (N) kLong - videoTimeBase: tensor (2) kInt - videoFps: tensor (1) kFloat - audioFrame: tensor (N, C) kFloat - audioFramePts: tensor (N) kLong - audioTimeBase: tensor (2) kInt - audioSampleRate: tensor (1) kInt -*/ -torch::List readVideoFromMemory( - // 1D tensor of data type uint8, storing the comparessed video data - torch::Tensor input_video, - // seeking frame in the video/audio stream is imprecise so seek to a - // timestamp earlier by a margin The unit of margin is second - double seekFrameMargin, - // If only pts is needed and video/audio frames are not needed, set it - // to 1 - int64_t getPtsOnly, - // bool variable. Set it to 1 if video stream should be read. Otherwise, set - // it to 0 - int64_t readVideoStream, - /* - Valid parameters values for rescaling video frames - ___________________________________________________ - | width | height | min_dimension | algorithm | - |_________________________________________________| - | 0 | 0 | 0 | original | - |_________________________________________________| - | 0 | 0 | >0 |scale to min dimension| - |_____|_____|____________________________________ | - | >0 | 0 | 0 | scale keeping W | - |_________________________________________________| - | 0 | >0 | 0 | scale keeping H | - |_________________________________________________| - | >0 | >0 | 0 | stretch/scale | - |_________________________________________________| - */ - int64_t width, - int64_t height, - int64_t minDimension, - // video frames with pts in [videoStartPts, videoEndPts] will be decoded - // For decoding all video frames, use [0, -1] - int64_t videoStartPts, - int64_t videoEndPts, - // numerator and denominator of time base of video stream. - // For decoding all video frames, supply dummy 0 (numerator) and 1 - // (denominator). For decoding localized video frames, need to supply - // them which will be checked during decoding - int64_t videoTimeBaseNum, - int64_t videoTimeBaseDen, - // bool variable. Set it to 1 if audio stream should be read. Otherwise, set - // it to 0 - int64_t readAudioStream, - // audio stream sampling rate. - // If not resampling audio waveform, supply 0 - // Otherwise, supply a positive integer. - int64_t audioSamples, - // audio stream channels - // Supply 0 to use the same number of channels as in the original audio - // stream - int64_t audioChannels, - // audio frames with pts in [audioStartPts, audioEndPts] will be decoded - // For decoding all audio frames, use [0, -1] - int64_t audioStartPts, - int64_t audioEndPts, - // numerator and denominator of time base of audio stream. - // For decoding all audio frames, supply dummy 0 (numerator) and 1 - // (denominator). 
For decoding localized audio frames, need to supply - // them which will be checked during decoding - int64_t audioTimeBaseNum, - int64_t audioTimeBaseDen); - -torch::List readVideoFromFile( - std::string videoPath, - double seekFrameMargin, - int64_t getPtsOnly, - int64_t readVideoStream, - int64_t width, - int64_t height, - int64_t minDimension, - int64_t videoStartPts, - int64_t videoEndPts, - int64_t videoTimeBaseNum, - int64_t videoTimeBaseDen, - int64_t readAudioStream, - int64_t audioSamples, - int64_t audioChannels, - int64_t audioStartPts, - int64_t audioEndPts, - int64_t audioTimeBaseNum, - int64_t audioTimeBaseDen); diff --git a/torchvision/csrc/cpu/video_reader/util.cpp b/torchvision/csrc/cpu/video_reader/util.cpp deleted file mode 100644 index ae3c3df0f0a..00000000000 --- a/torchvision/csrc/cpu/video_reader/util.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include "util.h" - -using namespace std; - -namespace util { - -unique_ptr getDecoderParams( - double seekFrameMargin, - int64_t getPtsOnly, - int64_t readVideoStream, - int videoWidth, - int videoHeight, - int videoMinDimension, - int64_t videoStartPts, - int64_t videoEndPts, - int videoTimeBaseNum, - int videoTimeBaseDen, - int64_t readAudioStream, - int audioSamples, - int audioChannels, - int64_t audioStartPts, - int64_t audioEndPts, - int audioTimeBaseNum, - int audioTimeBaseDen) { - unique_ptr params = make_unique(); - - if (readVideoStream == 1) { - params->formats.emplace( - MediaType::TYPE_VIDEO, MediaFormat(MediaType::TYPE_VIDEO)); - MediaFormat& videoFormat = params->formats[MediaType::TYPE_VIDEO]; - - videoFormat.format.video.width = videoWidth; - videoFormat.format.video.height = videoHeight; - videoFormat.format.video.minDimension = videoMinDimension; - videoFormat.format.video.startPts = videoStartPts; - videoFormat.format.video.endPts = videoEndPts; - videoFormat.format.video.timeBaseNum = videoTimeBaseNum; - videoFormat.format.video.timeBaseDen = videoTimeBaseDen; - } - - if (readAudioStream == 1) { - params->formats.emplace( - MediaType::TYPE_AUDIO, MediaFormat(MediaType::TYPE_AUDIO)); - MediaFormat& audioFormat = params->formats[MediaType::TYPE_AUDIO]; - - audioFormat.format.audio.samples = audioSamples; - audioFormat.format.audio.channels = audioChannels; - audioFormat.format.audio.startPts = audioStartPts; - audioFormat.format.audio.endPts = audioEndPts; - audioFormat.format.audio.timeBaseNum = audioTimeBaseNum; - audioFormat.format.audio.timeBaseDen = audioTimeBaseDen; - } - - params->seekFrameMargin = seekFrameMargin; - params->getPtsOnly = getPtsOnly; - - return params; -} - -} // namespace util diff --git a/torchvision/csrc/cpu/video_reader/util.h b/torchvision/csrc/cpu/video_reader/util.h deleted file mode 100644 index 6b5fd55388b..00000000000 --- a/torchvision/csrc/cpu/video_reader/util.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include -#include "FfmpegDecoder.h" - -namespace util { - -std::unique_ptr getDecoderParams( - double seekFrameMargin, - int64_t getPtsOnly, - int64_t readVideoStream, - int videoWidth, - int videoHeight, - int videoMinDimension, - int64_t videoStartPts, - int64_t videoEndPts, - int videoTimeBaseNum, - int videoTimeBaseDen, - int64_t readAudioStream, - int audioSamples, - int audioChannels, - int64_t audioStartPts, - int64_t audioEndPts, - int audioTimeBaseNum, - int audioTimeBaseDen); - -} // namespace util
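---

For reference, a minimal self-contained sketch of the ffmpeg::SyncDecoder init/decode/shutdown cycle that the readVideo()/probeVideo() implementations above follow. The helper name decodeAllVideoFrames and its hard-coded values are illustrative only, not part of this patch; error handling is reduced to early returns.

#include <string>
#include <vector>

#include "memory_buffer.h"
#include "sync_decoder.h"

using namespace ffmpeg;

// Decode every video frame of a file into DecoderOutputMessage objects,
// mirroring the loop in video_reader::readVideo above.
std::vector<DecoderOutputMessage> decodeAllVideoFrames(const std::string& path) {
  DecoderParameters params;
  params.uri = path;         // file input; use MemoryBuffer::getCallback for in-memory data
  params.timeoutMs = 600000; // same decoderTimeoutMs as above
  params.startOffset = 0;    // decode from the start of the stream...
  params.endOffset = -1;     // ...to the end

  MediaFormat videoFormat(0);
  videoFormat.type = TYPE_VIDEO;
  videoFormat.format.video.format = AV_PIX_FMT_RGB24;
  params.formats.insert(videoFormat);

  std::vector<DecoderOutputMessage> frames;
  SyncDecoder decoder;
  DecoderInCallback callback = nullptr; // nullptr: decoder reads from params.uri
  if (!decoder.init(params, std::move(callback))) {
    return frames; // initialization failed
  }
  DecoderOutputMessage msg;
  while (decoder.decode(&msg, params.timeoutMs) == 0) {
    if (msg.header.format.type == TYPE_VIDEO) {
      frames.push_back(std::move(msg));
    }
    msg.payload.reset();
  }
  decoder.shutdown();
  return frames;
}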