Integrated base decoder into VideoReader class and video_utils.py (py…

…torch#1766) Summary: Pull Request resolved: pytorch#1766 Replaced FfmpegDecoder (incompatible with VUE) with the base decoder (compatible with VUE). Modified the Python utilities in video_utils.py for internal simplification. The public interface is preserved. Differential Revision: D19415903 fbshipit-source-id: 998210ded30e228f3a025d3af5054eef8f073fe1
putivsky · Jan 27, 2020 · d7d16a8 · d7d16a8
1 parent bf98744
commit d7d16a8
Show file tree

Hide file tree

Showing 52 changed files with 745 additions and 2,345 deletions.
diff --git a/setup.py b/setup.py
@@ -155,41 +155,21 @@ def get_extensions():
         ffmpeg_root = os.path.dirname(ffmpeg_bin)
         ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')
 
-        # TorchVision video reader
+        # TorchVision base decoder + video reader
         video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
         video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp"))
-
-        ext_modules.append(
-            CppExtension(
-                'torchvision.video_reader',
-                video_reader_src,
-                include_dirs=[
-                    video_reader_src_dir,
-                    ffmpeg_include_dir,
-                    extensions_dir,
-                ],
-                libraries=[
-                    'avcodec',
-                    'avformat',
-                    'avutil',
-                    'swresample',
-                    'swscale',
-                ],
-                extra_compile_args=["-std=c++14"],
-                extra_link_args=["-std=c++14"],
-            )
-        )
-
-        # TorchVision base decoder
         base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
         base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "[!sync_decoder_test]*.cpp"))
 
+        combined_src = video_reader_src + base_decoder_src
+
         ext_modules.append(
             CppExtension(
-                'torchvision.base_decoder',
-                base_decoder_src,
+                'torchvision.video_reader',
+                combined_src,
                 include_dirs=[
                     base_decoder_src_dir,
+                    video_reader_src_dir,
                     ffmpeg_include_dir,
                     extensions_dir,
                 ],

diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.cpp b/torchvision/csrc/cpu/decoder/audio_sampler.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "audio_sampler.h"
 #include <c10/util/Logging.h>
 #include "util.h"

diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.h b/torchvision/csrc/cpu/decoder/audio_sampler.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
 #include "defs.h"

diff --git a/torchvision/csrc/cpu/decoder/audio_stream.cpp b/torchvision/csrc/cpu/decoder/audio_stream.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "audio_stream.h"
 #include <c10/util/Logging.h>
 #include <limits>
@@ -8,11 +6,23 @@
 namespace ffmpeg {
 
 namespace {
+bool operator==(const AudioFormat& x, const AVFrame& y) {
+  return x.samples == y.sample_rate && x.channels == y.channels &&
+      x.format == y.format;
+}
+
 bool operator==(const AudioFormat& x, const AVCodecContext& y) {
   return x.samples == y.sample_rate && x.channels == y.channels &&
       x.format == y.sample_fmt;
 }
 
+AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) {
+  x.samples = y.sample_rate;
+  x.channels = y.channels;
+  x.format = y.format;
+  return x;
+}
+
 AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) {
   x.samples = y.sample_rate;
   x.channels = y.channels;
@@ -65,12 +75,15 @@ int AudioStream::initFormat() {
 
 int AudioStream::estimateBytes(bool flush) {
   ensureSampler();
-  if (!(sampler_->getInputFormat().audio == *codecCtx_)) {
+  // check if input format gets changed
+  if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_)
+            : !(sampler_->getInputFormat().audio == *frame_)) {
     // - reinit sampler
     SamplerParameters params;
     params.type = format_.type;
     params.out = format_.format;
-    toAudioFormat(params.in.audio, *codecCtx_);
+    flush ? toAudioFormat(params.in.audio, *codecCtx_)
+          : toAudioFormat(params.in.audio, *frame_);
     if (flush || !sampler_->init(params)) {
       return -1;
     }
@@ -84,39 +97,12 @@ int AudioStream::estimateBytes(bool flush) {
             << ", channels: " << format_.format.audio.channels
             << ", format: " << format_.format.audio.format;
   }
-  return sampler_->getSamplesBytes(frame_);
+  return sampler_->getSamplesBytes(flush ? nullptr : frame_);
 }
 
 int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
   ensureSampler();
   return sampler_->sample(flush ? nullptr : frame_, out);
 }
 
-void AudioStream::setHeader(DecoderHeader* header) {
-  header->seqno = numGenerator_++;
-
-  if (codecCtx_->time_base.num != 0) {
-    header->pts = av_rescale_q(
-        av_frame_get_best_effort_timestamp(frame_),
-        codecCtx_->time_base,
-        AV_TIME_BASE_Q);
-  } else {
-    // If the codec time_base is missing then we would've skipped the
-    // rescalePackage step to rescale to codec time_base, so here we can
-    // rescale straight from the stream time_base into AV_TIME_BASE_Q.
-    header->pts = av_rescale_q(
-        av_frame_get_best_effort_timestamp(frame_),
-        inputCtx_->streams[format_.stream]->time_base,
-        AV_TIME_BASE_Q);
-  }
-
-  if (convertPtsToWallTime_) {
-    keeper_.adjust(header->pts);
-  }
-
-  header->keyFrame = 1;
-  header->fps = std::numeric_limits<double>::quiet_NaN();
-  header->format = format_;
-}
-
 } // namespace ffmpeg
diff --git a/torchvision/csrc/cpu/decoder/audio_stream.h b/torchvision/csrc/cpu/decoder/audio_stream.h
@@ -1,10 +1,7 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
 #include "audio_sampler.h"
 #include "stream.h"
-#include "time_keeper.h"
 
 namespace ffmpeg {
 
@@ -25,13 +22,11 @@ class AudioStream : public Stream {
   int initFormat() override;
   int estimateBytes(bool flush) override;
   int copyFrameBytes(ByteStorage* out, bool flush) override;
-  void setHeader(DecoderHeader* header) override;
 
   void ensureSampler();
 
  private:
   std::unique_ptr<AudioSampler> sampler_;
-  TimeKeeper keeper_;
 };
 
 } // namespace ffmpeg
diff --git a/torchvision/csrc/cpu/decoder/cc_stream.cpp b/torchvision/csrc/cpu/decoder/cc_stream.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "cc_stream.h"
 
 namespace ffmpeg {

diff --git a/torchvision/csrc/cpu/decoder/cc_stream.h b/torchvision/csrc/cpu/decoder/cc_stream.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
 #include "subtitle_stream.h"