Refactor Streamer implementation (#2402)

Summary: * Move the helper wrapping code in TorchBind layer to proper wrapper class for so that it will be re-used in PyBind11. * Move `add_basic_[audio|video]_stream` methods from C++ to Python, as they are just string manipulation. This will make PyBind11-based binding simpler as it needs not to deal with dtype. * Move `add_[audio|video]_stream` wrapper signature to Streamer core, so that Streamer directly deals with `c10::optional`.† † Related to this, there is a slight change in how the empty filter expression is stored. Originally, if an empty filter expression was given to `add_[audio|video]_stream` method, the `StreamReaderOutputStream` was showing it as empty string `""`, even though internally it was using `"anull"` or `"null"`. Now `StreamReaderOutputStream` shows the corresponding filter expression that is actually being used. Ref #2400 Pull Request resolved: #2402 Reviewed By: nateanl Differential Revision: D36488808 Pulled By: mthrok fbshipit-source-id: 877ca731364d10fc0cb9d97e75d55df9180f2047
pytorch · May 19, 2022 · eed5753 · eed5753
1 parent 647f28e
commit eed5753
Show file tree

Hide file tree

Showing 18 changed files with 372 additions and 472 deletions.
diff --git a/test/torchaudio_unittest/io/stream_reader_test.py b/test/torchaudio_unittest/io/stream_reader_test.py
@@ -165,7 +165,7 @@ def test_basic_audio_stream(self):
 
         sinfo = s.get_out_stream_info(0)
         assert sinfo.source_index == s.default_audio_stream
-        assert sinfo.filter_description == ""
+        assert sinfo.filter_description == "anull"
 
         sinfo = s.get_out_stream_info(1)
         assert sinfo.source_index == s.default_audio_stream
@@ -185,7 +185,7 @@ def test_basic_video_stream(self):
 
         sinfo = s.get_out_stream_info(0)
         assert sinfo.source_index == s.default_video_stream
-        assert sinfo.filter_description == ""
+        assert sinfo.filter_description == "null"
 
         sinfo = s.get_out_stream_info(1)
         assert sinfo.source_index == s.default_video_stream

diff --git a/torchaudio/csrc/CMakeLists.txt b/torchaudio/csrc/CMakeLists.txt
@@ -181,6 +181,7 @@ if(USE_FFMPEG)
     ffmpeg/sink.cpp
     ffmpeg/stream_processor.cpp
     ffmpeg/streamer.cpp
+    ffmpeg/stream_reader_wrapper.cpp
     )
   message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}")
   find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil)

diff --git a/torchaudio/csrc/ffmpeg/decoder.cpp b/torchaudio/csrc/ffmpeg/decoder.cpp
@@ -8,8 +8,8 @@ namespace ffmpeg {
 ////////////////////////////////////////////////////////////////////////////////
 Decoder::Decoder(
     AVCodecParameters* pParam,
-    const std::string& decoder_name,
-    const std::map<std::string, std::string>& decoder_option,
+    const c10::optional<std::string>& decoder_name,
+    const OptionDict& decoder_option,
     const torch::Device& device)
     : pCodecContext(get_decode_context(pParam->codec_id, decoder_name)) {
   init_codec_context(

diff --git a/torchaudio/csrc/ffmpeg/decoder.h b/torchaudio/csrc/ffmpeg/decoder.h
@@ -13,8 +13,8 @@ class Decoder {
   // Default constructable
   Decoder(
       AVCodecParameters* pParam,
-      const std::string& decoder_name,
-      const std::map<std::string, std::string>& decoder_option,
+      const c10::optional<std::string>& decoder_name,
+      const OptionDict& decoder_option,
       const torch::Device& device);
   // Custom destructor to clean up the resources
   ~Decoder() = default;

diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
@@ -17,10 +17,9 @@ void AVFormatContextDeleter::operator()(AVFormatContext* p) {
 
 namespace {
 
-AVDictionary* get_option_dict(
-    const std::map<std::string, std::string>& option) {
+AVDictionary* get_option_dict(const OptionDict& option) {
   AVDictionary* opt = nullptr;
-  for (auto& it : option) {
+  for (const auto& it : option) {
     av_dict_set(&opt, it.first.c_str(), it.second.c_str(), 0);
   }
   return opt;
@@ -66,12 +65,25 @@ std::string join(std::vector<std::string> vars) {
 
 AVFormatContextPtr get_input_format_context(
     const std::string& src,
-    const std::string& device,
-    const std::map<std::string, std::string>& option) {
+    const c10::optional<std::string>& device,
+    const OptionDict& option) {
   AVFormatContext* pFormat = NULL;
 
-  AVINPUT_FORMAT_CONST AVInputFormat* pInput =
-      device.empty() ? NULL : av_find_input_format(device.c_str());
+  AVINPUT_FORMAT_CONST AVInputFormat* pInput = [&]() -> AVInputFormat* {
+    if (device.has_value()) {
+      std::string device_str = device.value();
+      AVINPUT_FORMAT_CONST AVInputFormat* p =
+          av_find_input_format(device_str.c_str());
+      if (!p) {
+        std::ostringstream msg;
+        msg << "Unsupported device: \"" << device_str << "\"";
+        throw std::runtime_error(msg.str());
+      }
+      return p;
+    }
+    return nullptr;
+  }();
+
   AVDictionary* opt = get_option_dict(option);
   int ret = avformat_open_input(&pFormat, src.c_str(), pInput, &opt);
 
@@ -148,18 +160,18 @@ void AVCodecContextDeleter::operator()(AVCodecContext* p) {
 namespace {
 const AVCodec* get_decode_codec(
     enum AVCodecID codec_id,
-    const std::string& decoder_name) {
-  const AVCodec* pCodec = decoder_name.empty()
+    const c10::optional<std::string>& decoder_name) {
+  const AVCodec* pCodec = !decoder_name.has_value()
       ? avcodec_find_decoder(codec_id)
-      : avcodec_find_decoder_by_name(decoder_name.c_str());
+      : avcodec_find_decoder_by_name(decoder_name.value().c_str());
 
   if (!pCodec) {
     std::stringstream ss;
-    if (decoder_name.empty()) {
+    if (!decoder_name.has_value()) {
       ss << "Unsupported codec: \"" << avcodec_get_name(codec_id) << "\", ("
          << codec_id << ").";
     } else {
-      ss << "Unsupported codec: \"" << decoder_name << "\".";
+      ss << "Unsupported codec: \"" << decoder_name.value() << "\".";
     }
     throw std::runtime_error(ss.str());
   }
@@ -170,7 +182,7 @@ const AVCodec* get_decode_codec(
 
 AVCodecContextPtr get_decode_context(
     enum AVCodecID codec_id,
-    const std::string& decoder_name) {
+    const c10::optional<std::string>& decoder_name) {
   const AVCodec* pCodec = get_decode_codec(codec_id, decoder_name);
 
   AVCodecContext* pCodecContext = avcodec_alloc_context3(pCodec);
@@ -216,8 +228,8 @@ const AVCodecHWConfig* get_cuda_config(const AVCodec* pCodec) {
 void init_codec_context(
     AVCodecContext* pCodecContext,
     AVCodecParameters* pParams,
-    const std::string& decoder_name,
-    const std::map<std::string, std::string>& decoder_option,
+    const c10::optional<std::string>& decoder_name,
+    const OptionDict& decoder_option,
     const torch::Device& device,
     AVBufferRefPtr& pHWBufferRef) {
   const AVCodec* pCodec = get_decode_codec(pParams->codec_id, decoder_name);

diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.h b/torchaudio/csrc/ffmpeg/ffmpeg.h
@@ -23,6 +23,8 @@ extern "C" {
 namespace torchaudio {
 namespace ffmpeg {
 
+using OptionDict = std::map<std::string, std::string>;
+
 // Replacement of av_err2str, which causes
 // `error: taking address of temporary array`
 // https://github.com/joncampbell123/composite-video-simulator/issues/5
@@ -71,8 +73,8 @@ struct AVFormatContextPtr
 // create format context for reading media
 AVFormatContextPtr get_input_format_context(
     const std::string& src,
-    const std::string& device,
-    const std::map<std::string, std::string>& option);
+    const c10::optional<std::string>& device,
+    const OptionDict& option);
 
 ////////////////////////////////////////////////////////////////////////////////
 // AVPacket
@@ -141,14 +143,14 @@ struct AVCodecContextPtr
 // Allocate codec context from either decoder name or ID
 AVCodecContextPtr get_decode_context(
     enum AVCodecID codec_id,
-    const std::string& decoder);
+    const c10::optional<std::string>& decoder);
 
 // Initialize codec context with the parameters
 void init_codec_context(
     AVCodecContext* pCodecContext,
     AVCodecParameters* pParams,
-    const std::string& decoder_name,
-    const std::map<std::string, std::string>& decoder_option,
+    const c10::optional<std::string>& decoder_name,
+    const OptionDict& decoder_option,
     const torch::Device& device,
     AVBufferRefPtr& pHWBufferRef);
 

diff --git a/torchaudio/csrc/ffmpeg/filter_graph.cpp b/torchaudio/csrc/ffmpeg/filter_graph.cpp
@@ -7,10 +7,11 @@ namespace ffmpeg {
 FilterGraph::FilterGraph(
     AVRational time_base,
     AVCodecParameters* codecpar,
-    std::string filter_description)
+    const c10::optional<std::string>& filter_description)
     : input_time_base(time_base),
       codecpar(codecpar),
-      filter_description(std::move(filter_description)),
+      filter_description(filter_description.value_or(
+          codecpar->codec_type == AVMEDIA_TYPE_AUDIO ? "anull" : "null")),
       media_type(codecpar->codec_type) {
   init();
 }
@@ -49,10 +50,10 @@ std::string get_video_src_args(
   std::snprintf(
       args,
       sizeof(args),
-      "video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:pixel_aspect=%d/%d",
+      "video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:pixel_aspect=%d/%d",
       codecpar->width,
       codecpar->height,
-      static_cast<AVPixelFormat>(codecpar->format),
+      av_get_pix_fmt_name(static_cast<AVPixelFormat>(codecpar->format)),
       time_base.num,
       time_base.den,
       codecpar->sample_aspect_ratio.num,
@@ -165,16 +166,12 @@ void FilterGraph::add_process() {
   // If you are debugging this part of the code, you might get confused.
   InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx};
 
-  std::string desc = filter_description.empty()
-      ? (media_type == AVMEDIA_TYPE_AUDIO) ? "anull" : "null"
-      : filter_description;
-
-  int ret =
-      avfilter_graph_parse_ptr(pFilterGraph, desc.c_str(), out, in, nullptr);
+  int ret = avfilter_graph_parse_ptr(
+      pFilterGraph, filter_description.c_str(), out, in, nullptr);
 
   if (ret < 0) {
     throw std::runtime_error(
-        "Failed to create the filter from \"" + desc + "\" (" +
+        "Failed to create the filter from \"" + filter_description + "\" (" +
         av_err2string(ret) + ".)");
   }
 }

diff --git a/torchaudio/csrc/ffmpeg/filter_graph.h b/torchaudio/csrc/ffmpeg/filter_graph.h
@@ -24,7 +24,7 @@ class FilterGraph {
   FilterGraph(
       AVRational time_base,
       AVCodecParameters* codecpar,
-      std::string filter_desc);
+      const c10::optional<std::string>& filter_desc);
   // Custom destructor to release AVFilterGraph*
   ~FilterGraph() = default;
   // Non-copyable