Integrated base decoder into VideoReader class and video_utils.py (py…

…torch#1766) Summary: Pull Request resolved: pytorch#1766 Replaced FfmpegDecoder (incompatible with VUE) with the base decoder (compatible with VUE). Modified the Python utilities in video_utils.py for internal simplification. The public interface is preserved. Differential Revision: D19415903 fbshipit-source-id: c750e2b74573a60f7f6930f1859c577e26245b89
putivsky · Jan 23, 2020 · 08d5f42 · 08d5f42
1 parent bf98744
commit 08d5f42
Show file tree

Hide file tree

Showing 52 changed files with 696 additions and 2,352 deletions.
diff --git a/setup.py b/setup.py
@@ -155,41 +155,21 @@ def get_extensions():
         ffmpeg_root = os.path.dirname(ffmpeg_bin)
         ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')
 
-        # TorchVision video reader
+        # TorchVision base decoder + video reader
         video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
         video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp"))
-
-        ext_modules.append(
-            CppExtension(
-                'torchvision.video_reader',
-                video_reader_src,
-                include_dirs=[
-                    video_reader_src_dir,
-                    ffmpeg_include_dir,
-                    extensions_dir,
-                ],
-                libraries=[
-                    'avcodec',
-                    'avformat',
-                    'avutil',
-                    'swresample',
-                    'swscale',
-                ],
-                extra_compile_args=["-std=c++14"],
-                extra_link_args=["-std=c++14"],
-            )
-        )
-
-        # TorchVision base decoder
         base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
         base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "[!sync_decoder_test]*.cpp"))
 
+        combined_src = video_reader_src + base_decoder_src
+
         ext_modules.append(
             CppExtension(
-                'torchvision.base_decoder',
-                base_decoder_src,
+                'torchvision.video_reader',
+                combined_src,
                 include_dirs=[
                     base_decoder_src_dir,
+                    video_reader_src_dir,
                     ffmpeg_include_dir,
                     extensions_dir,
                 ],

diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.cpp b/torchvision/csrc/cpu/decoder/audio_sampler.cpp
@@ -1,8 +1,6 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "audio_sampler.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/audio_sampler.h"
 #include <c10/util/Logging.h>
-#include "util.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/util.h"
 
 // www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24
 

diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.h b/torchvision/csrc/cpu/decoder/audio_sampler.h
@@ -1,8 +1,6 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
-#include "defs.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/defs.h"
 
 extern "C" {
 #include <libswresample/swresample.h>

diff --git a/torchvision/csrc/cpu/decoder/audio_stream.cpp b/torchvision/csrc/cpu/decoder/audio_stream.cpp
@@ -1,18 +1,28 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "audio_stream.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/audio_stream.h"
 #include <c10/util/Logging.h>
 #include <limits>
-#include "util.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/util.h"
 
 namespace ffmpeg {
 
 namespace {
+bool operator==(const AudioFormat& x, const AVFrame& y) {
+  return x.samples == y.sample_rate && x.channels == y.channels &&
+      x.format == y.format;
+}
+
 bool operator==(const AudioFormat& x, const AVCodecContext& y) {
   return x.samples == y.sample_rate && x.channels == y.channels &&
       x.format == y.sample_fmt;
 }
 
+AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) {
+  x.samples = y.sample_rate;
+  x.channels = y.channels;
+  x.format = y.format;
+  return x;
+}
+
 AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) {
   x.samples = y.sample_rate;
   x.channels = y.channels;
@@ -65,12 +75,15 @@ int AudioStream::initFormat() {
 
 int AudioStream::estimateBytes(bool flush) {
   ensureSampler();
-  if (!(sampler_->getInputFormat().audio == *codecCtx_)) {
+  // check if input format gets changed
+  if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_)
+            : !(sampler_->getInputFormat().audio == *frame_)) {
     // - reinit sampler
     SamplerParameters params;
     params.type = format_.type;
     params.out = format_.format;
-    toAudioFormat(params.in.audio, *codecCtx_);
+    flush ? toAudioFormat(params.in.audio, *codecCtx_)
+          : toAudioFormat(params.in.audio, *frame_);
     if (flush || !sampler_->init(params)) {
       return -1;
     }
@@ -84,39 +97,12 @@ int AudioStream::estimateBytes(bool flush) {
             << ", channels: " << format_.format.audio.channels
             << ", format: " << format_.format.audio.format;
   }
-  return sampler_->getSamplesBytes(frame_);
+  return sampler_->getSamplesBytes(flush ? nullptr : frame_);
 }
 
 int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
   ensureSampler();
   return sampler_->sample(flush ? nullptr : frame_, out);
 }
 
-void AudioStream::setHeader(DecoderHeader* header) {
-  header->seqno = numGenerator_++;
-
-  if (codecCtx_->time_base.num != 0) {
-    header->pts = av_rescale_q(
-        av_frame_get_best_effort_timestamp(frame_),
-        codecCtx_->time_base,
-        AV_TIME_BASE_Q);
-  } else {
-    // If the codec time_base is missing then we would've skipped the
-    // rescalePackage step to rescale to codec time_base, so here we can
-    // rescale straight from the stream time_base into AV_TIME_BASE_Q.
-    header->pts = av_rescale_q(
-        av_frame_get_best_effort_timestamp(frame_),
-        inputCtx_->streams[format_.stream]->time_base,
-        AV_TIME_BASE_Q);
-  }
-
-  if (convertPtsToWallTime_) {
-    keeper_.adjust(header->pts);
-  }
-
-  header->keyFrame = 1;
-  header->fps = std::numeric_limits<double>::quiet_NaN();
-  header->format = format_;
-}
-
 } // namespace ffmpeg
diff --git a/torchvision/csrc/cpu/decoder/audio_stream.h b/torchvision/csrc/cpu/decoder/audio_stream.h
@@ -1,10 +1,7 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
-#include "audio_sampler.h"
-#include "stream.h"
-#include "time_keeper.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/audio_sampler.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/stream.h"
 
 namespace ffmpeg {
 
@@ -25,13 +22,11 @@ class AudioStream : public Stream {
   int initFormat() override;
   int estimateBytes(bool flush) override;
   int copyFrameBytes(ByteStorage* out, bool flush) override;
-  void setHeader(DecoderHeader* header) override;
 
   void ensureSampler();
 
  private:
   std::unique_ptr<AudioSampler> sampler_;
-  TimeKeeper keeper_;
 };
 
 } // namespace ffmpeg
diff --git a/torchvision/csrc/cpu/decoder/cc_stream.cpp b/torchvision/csrc/cpu/decoder/cc_stream.cpp
@@ -1,6 +1,4 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "cc_stream.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/cc_stream.h"
 
 namespace ffmpeg {
 

diff --git a/torchvision/csrc/cpu/decoder/cc_stream.h b/torchvision/csrc/cpu/decoder/cc_stream.h
@@ -1,8 +1,6 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
-#include "subtitle_stream.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/subtitle_stream.h"
 
 namespace ffmpeg {
 

diff --git a/torchvision/csrc/cpu/decoder/decoder.cpp b/torchvision/csrc/cpu/decoder/decoder.cpp
@@ -1,22 +1,18 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "decoder.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/decoder.h"
 #include <c10/util/Logging.h>
 #include <future>
 #include <iostream>
 #include <mutex>
-#include "audio_stream.h"
-#include "cc_stream.h"
-#include "subtitle_stream.h"
-#include "util.h"
-#include "video_stream.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/audio_stream.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/cc_stream.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/subtitle_stream.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/util.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/video_stream.h"
 
 namespace ffmpeg {
 
 namespace {
 
-constexpr ssize_t kMinSeekBufferSize = 1024;
-constexpr ssize_t kMaxSeekBufferSize = 4 * 1024;
 constexpr size_t kIoBufferSize = 4 * 1024;
 constexpr size_t kLogBufferSize = 1024;
 
@@ -205,7 +201,7 @@ void Decoder::initOnce() {
     av_lockmgr_register(&ffmpeg_lock);
     av_log_set_callback(Decoder::logFunction);
     av_log_set_level(AV_LOG_ERROR);
-    LOG(INFO) << "Registered ffmpeg libs";
+    VLOG(1) << "Registered ffmpeg libs";
   });
 }
 
@@ -248,23 +244,21 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) {
       return false;
     }
 
-    bool canSeek = in(nullptr, 0, 0) == 0;
+    bool canSeek = in(nullptr, 0, 0, 0) == 0;
 
-    if (!seekableBuffer_.init(
-            std::forward<DecoderInCallback>(in),
-            kMinSeekBufferSize,
-            kMaxSeekBufferSize,
-            params_.timeoutMs)) {
-      LOG(ERROR) << "seekable buffer initialization failed";
-      av_free(avioCtxBuffer);
-      avformat_close_input(&tmpCtx);
-      cleanUp();
-      return false;
-    }
+    seekableBuffer_.init(std::forward<DecoderInCallback>(in));
 
     if (params_.isImage) {
+      ImageType type = ImageType::UNKNOWN;
+      if (!seekableBuffer_.detect(params_.timeoutMs, &type)) {
+        LOG(ERROR) << "can't detect image type";
+        av_free(avioCtxBuffer);
+        avformat_close_input(&tmpCtx);
+        cleanUp();
+        return false;
+      }
       const char* fmtName = "image2";
-      switch (seekableBuffer_.getImageType()) {
+      switch (type) {
         case ImageType::JPEG:
           fmtName = "jpeg_pipe";
           break;
@@ -364,16 +358,15 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) {
 
   onInit();
 
-  if (params.startOffsetMs != 0) {
-    av_seek_frame(
-        inputCtx_,
-        -1,
-        params.startOffsetMs * AV_TIME_BASE / 1000,
-        AVSEEK_FLAG_FRAME | AVSEEK_FLAG_ANY);
+  if (params.startOffset != 0) {
+    auto offset = params.startOffset <= params.seekAccuracy
+        ? 0
+        : params.startOffset - params.seekAccuracy;
+
+    av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD);
   }
 
-  LOG(INFO) << "Decoder initialized, log level: " << params_.logLevel;
-  outOfRange_ = false;
+  VLOG(1) << "Decoder initialized, log level: " << params_.logLevel;
   return true;
 }
 
@@ -418,6 +411,7 @@ bool Decoder::activateStreams() {
         return false;
       }
       streams_.emplace(i, std::move(stream));
+      inRange_.set(i, true);
     }
   }
 
@@ -459,7 +453,7 @@ void Decoder::cleanUp() {
 }
 
 int Decoder::getBytes(size_t workingTimeInMs) {
-  if (outOfRange_) {
+  if (inRange_.none()) {
     return ENODATA;
   }
   // decode frames until cache is full and leave thread
@@ -499,13 +493,11 @@ int Decoder::getBytes(size_t workingTimeInMs) {
 
     // get stream
     auto stream = findByIndex(avPacket.stream_index);
-    if (stream == nullptr) {
+    if (stream == nullptr || !inRange_.test(stream->getIndex())) {
       av_packet_unref(&avPacket);
       continue;
     }
 
-    stream->rescalePackage(&avPacket);
-
     AVPacket copyPacket = avPacket;
 
     size_t numConsecutiveNoBytes = 0;
@@ -572,8 +564,10 @@ int Decoder::processPacket(Stream* stream, AVPacket* packet) {
   if ((result = stream->decodeFrame(packet, &gotFrame)) >= 0 && gotFrame &&
       stream->getFrameBytes(&msg, params_.headerOnly) > 0) {
     // check end offset
-    if (params_.endOffsetMs <= 0 ||
-        !(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) {
+    bool endInRange =
+        params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
+    inRange_.set(stream->getIndex(), endInRange);
+    if (endInRange && msg.header.pts >= params_.startOffset) {
       push(std::move(msg));
     }
   }
@@ -587,8 +581,10 @@ void Decoder::flushStreams() {
     while (msg.payload = createByteStorage(0),
            stream.second->flush(&msg, params_.headerOnly) > 0) {
       // check end offset
-      if (params_.endOffsetMs <= 0 ||
-          !(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) {
+      bool endInRange =
+          params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
+      inRange_.set(stream.second->getIndex(), endInRange);
+      if (endInRange && msg.header.pts >= params_.startOffset) {
         push(std::move(msg));
       }
     }

diff --git a/torchvision/csrc/cpu/decoder/decoder.h b/torchvision/csrc/cpu/decoder/decoder.h
@@ -1,9 +1,9 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once
 
-#include "seekable_buffer.h"
-#include "stream.h"
+#include <bitset>
+#include <unordered_map>
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/seekable_buffer.h"
+#include "pytorch/vision/torchvision/csrc/cpu/decoder/stream.h"
 
 namespace ffmpeg {
 
@@ -72,6 +72,6 @@ class Decoder : public MediaDecoder {
   AVFormatContext* inputCtx_{nullptr};
   AVIOContext* avioCtx_{nullptr};
   std::unordered_map<ssize_t, std::unique_ptr<Stream>> streams_;
-  bool outOfRange_{false};
+  std::bitset<64> inRange_;
 };
 } // namespace ffmpeg