Integrated base decoder into VideoReader class and video_utils.py (#1766)

Summary: Pull Request resolved: #1766 Replaced FfmpegDecoder (incompatible with VUE) with the base decoder (compatible with VUE). Modified the Python utilities in video_utils.py for internal simplification. The public interface is preserved. Differential Revision: D19415903 fbshipit-source-id: c48f939c59e2e1c61f44c92424d26a17ae36ee7f
pytorch · Jan 31, 2020 · 7f2fb62 · 7f2fb62
1 parent ccdc9f7
commit 7f2fb62
Show file tree

Hide file tree

Showing 53 changed files with 1,183 additions and 2,594 deletions.
diff --git a/setup.py b/setup.py
@@ -155,41 +155,21 @@ def get_extensions():
         ffmpeg_root = os.path.dirname(ffmpeg_bin)
         ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')
 
-        # TorchVision video reader
+        # TorchVision base decoder + video reader
         video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
         video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp"))
-
-        ext_modules.append(
-            CppExtension(
-                'torchvision.video_reader',
-                video_reader_src,
-                include_dirs=[
-                    video_reader_src_dir,
-                    ffmpeg_include_dir,
-                    extensions_dir,
-                ],
-                libraries=[
-                    'avcodec',
-                    'avformat',
-                    'avutil',
-                    'swresample',
-                    'swscale',
-                ],
-                extra_compile_args=["-std=c++14"],
-                extra_link_args=["-std=c++14"],
-            )
-        )
-
-        # TorchVision base decoder
         base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
         base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "[!sync_decoder_test]*.cpp"))
 
+        combined_src = video_reader_src + base_decoder_src
+
         ext_modules.append(
             CppExtension(
-                'torchvision.base_decoder',
-                base_decoder_src,
+                'torchvision.video_reader',
+                combined_src,
                 include_dirs=[
                     base_decoder_src_dir,
+                    video_reader_src_dir,
                     ffmpeg_include_dir,
                     extensions_dir,
                 ],

diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.cpp b/torchvision/csrc/cpu/decoder/audio_sampler.cpp
@@ -1,15 +1,10 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "audio_sampler.h"
 #include <c10/util/Logging.h>
 #include "util.h"
 
-// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24
-
-#ifndef SWR_CH_MAX
-#define SWR_CH_MAX 32
-#endif
+#define AVRESAMPLE_MAX_CHANNELS 32
 
+// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24
 namespace ffmpeg {
 
 namespace {
@@ -94,17 +89,20 @@ int AudioSampler::numOutputSamples(int inSamples) const {
 }
 
 int AudioSampler::getSamplesBytes(AVFrame* frame) const {
-  return av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) *
-      numOutputSamples(frame ? frame->nb_samples : 0) *
-      params_.out.audio.channels;
+  return av_samples_get_buffer_size(
+      nullptr,
+      params_.out.audio.channels,
+      numOutputSamples(frame ? frame->nb_samples : 0),
+      (AVSampleFormat)params_.out.audio.format,
+      1);
 }
 
 int AudioSampler::sample(
     const uint8_t* inPlanes[],
     int inNumSamples,
     ByteStorage* out,
     int outNumSamples) {
-  uint8_t* outPlanes[SWR_CH_MAX] = {nullptr};
+  uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
   int result;
   if ((result = preparePlanes(
            params_.out.audio, out->writableTail(), outNumSamples, outPlanes)) <
@@ -140,9 +138,12 @@ int AudioSampler::sample(AVFrame* frame, ByteStorage* out) {
     return 0;
   }
 
-  const auto samplesBytes =
-      av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) *
-      outNumSamples * params_.out.audio.channels;
+  const auto samplesBytes = av_samples_get_buffer_size(
+      nullptr,
+      params_.out.audio.channels,
+      outNumSamples,
+      (AVSampleFormat)params_.out.audio.format,
+      1);
 
   // bytes must be allocated
   CHECK_LE(samplesBytes, out->tail());
@@ -167,14 +168,17 @@ int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) {
     return 0;
   }
 
-  const auto samplesBytes =
-      av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) *
-      outNumSamples * params_.out.audio.channels;
+  const auto samplesBytes = av_samples_get_buffer_size(
+      nullptr,
+      params_.out.audio.channels,
+      outNumSamples,
+      (AVSampleFormat)params_.out.audio.format,
+      1);
 
   out->clear();
   out->ensure(samplesBytes);
 
-  uint8_t* inPlanes[SWR_CH_MAX] = {nullptr};
+  uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
   int result;
   if (in &&
       (result = preparePlanes(

diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.h b/torchvision/csrc/cpu/decoder/audio_sampler.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
 #include "defs.h"

diff --git a/torchvision/csrc/cpu/decoder/audio_stream.cpp b/torchvision/csrc/cpu/decoder/audio_stream.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "audio_stream.h"
 #include <c10/util/Logging.h>
 #include <limits>
@@ -8,11 +6,23 @@
 namespace ffmpeg {
 
 namespace {
+bool operator==(const AudioFormat& x, const AVFrame& y) {
+  return x.samples == y.sample_rate && x.channels == y.channels &&
+      x.format == y.format;
+}
+
 bool operator==(const AudioFormat& x, const AVCodecContext& y) {
   return x.samples == y.sample_rate && x.channels == y.channels &&
       x.format == y.sample_fmt;
 }
 
+AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) {
+  x.samples = y.sample_rate;
+  x.channels = y.channels;
+  x.format = y.format;
+  return x;
+}
+
 AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) {
   x.samples = y.sample_rate;
   x.channels = y.channels;
@@ -29,7 +39,8 @@ AudioStream::AudioStream(
     : Stream(
           inputCtx,
           MediaFormat::makeMediaFormat(format, index),
-          convertPtsToWallTime) {}
+          convertPtsToWallTime,
+          0) {}
 
 AudioStream::~AudioStream() {
   if (sampler_) {
@@ -65,12 +76,15 @@ int AudioStream::initFormat() {
 
 int AudioStream::estimateBytes(bool flush) {
   ensureSampler();
-  if (!(sampler_->getInputFormat().audio == *codecCtx_)) {
+  // check if input format gets changed
+  if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_)
+            : !(sampler_->getInputFormat().audio == *frame_)) {
     // - reinit sampler
     SamplerParameters params;
     params.type = format_.type;
     params.out = format_.format;
-    toAudioFormat(params.in.audio, *codecCtx_);
+    flush ? toAudioFormat(params.in.audio, *codecCtx_)
+          : toAudioFormat(params.in.audio, *frame_);
     if (flush || !sampler_->init(params)) {
       return -1;
     }
@@ -84,39 +98,12 @@ int AudioStream::estimateBytes(bool flush) {
             << ", channels: " << format_.format.audio.channels
             << ", format: " << format_.format.audio.format;
   }
-  return sampler_->getSamplesBytes(frame_);
+  return sampler_->getSamplesBytes(flush ? nullptr : frame_);
 }
 
 int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
   ensureSampler();
   return sampler_->sample(flush ? nullptr : frame_, out);
 }
 
-void AudioStream::setHeader(DecoderHeader* header) {
-  header->seqno = numGenerator_++;
-
-  if (codecCtx_->time_base.num != 0) {
-    header->pts = av_rescale_q(
-        av_frame_get_best_effort_timestamp(frame_),
-        codecCtx_->time_base,
-        AV_TIME_BASE_Q);
-  } else {
-    // If the codec time_base is missing then we would've skipped the
-    // rescalePackage step to rescale to codec time_base, so here we can
-    // rescale straight from the stream time_base into AV_TIME_BASE_Q.
-    header->pts = av_rescale_q(
-        av_frame_get_best_effort_timestamp(frame_),
-        inputCtx_->streams[format_.stream]->time_base,
-        AV_TIME_BASE_Q);
-  }
-
-  if (convertPtsToWallTime_) {
-    keeper_.adjust(header->pts);
-  }
-
-  header->keyFrame = 1;
-  header->fps = std::numeric_limits<double>::quiet_NaN();
-  header->format = format_;
-}
-
 } // namespace ffmpeg
diff --git a/torchvision/csrc/cpu/decoder/audio_stream.h b/torchvision/csrc/cpu/decoder/audio_stream.h
@@ -1,10 +1,7 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
 #include "audio_sampler.h"
 #include "stream.h"
-#include "time_keeper.h"
 
 namespace ffmpeg {
 
@@ -25,13 +22,11 @@ class AudioStream : public Stream {
   int initFormat() override;
   int estimateBytes(bool flush) override;
   int copyFrameBytes(ByteStorage* out, bool flush) override;
-  void setHeader(DecoderHeader* header) override;
 
   void ensureSampler();
 
  private:
   std::unique_ptr<AudioSampler> sampler_;
-  TimeKeeper keeper_;
 };
 
 } // namespace ffmpeg
diff --git a/torchvision/csrc/cpu/decoder/cc_stream.cpp b/torchvision/csrc/cpu/decoder/cc_stream.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "cc_stream.h"
 
 namespace ffmpeg {

diff --git a/torchvision/csrc/cpu/decoder/cc_stream.h b/torchvision/csrc/cpu/decoder/cc_stream.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
 #include "subtitle_stream.h"