Skip to content

Commit

Permalink
Update and fill the rest of ffmpeg-integration C++ code (pytorch#2113)
Browse files Browse the repository at this point in the history
Summary:
- Introduce AudioBuffer and VideoBuffer for different ways of handling frames
- Update the way option dictionary is passed
- Remove unused AutoFrameUnref
- Add SrcStreamInfo/OutputStreamInfo classes

Pull Request resolved: pytorch#2113

Reviewed By: nateanl

Differential Revision: D33356144

Pulled By: mthrok

fbshipit-source-id: e837e84fae48baa7befd5c70599bcd2cbb61514d
  • Loading branch information
mthrok authored and xiaohui-zhang committed May 4, 2022
1 parent 52c6f52 commit 822d08e
Show file tree
Hide file tree
Showing 10 changed files with 283 additions and 57 deletions.
164 changes: 141 additions & 23 deletions torchaudio/csrc/ffmpeg/buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,27 @@
namespace torchaudio {
namespace ffmpeg {

Buffer::Buffer(AVMediaType type) : media_type(type) {}
// Base-class constructor: records the chunking policy. Both members are
// const, so they must be set in the initializer list.
Buffer::Buffer(int fpc, int nc) : frames_per_chunk(fpc), num_chunks(nc) {}

// Audio specialization simply forwards the chunking policy to the base.
AudioBuffer::AudioBuffer(int fpc, int nc) : Buffer(fpc, nc) {}

// Video specialization simply forwards the chunking policy to the base.
VideoBuffer::VideoBuffer(int fpc, int nc) : Buffer(fpc, nc) {}

////////////////////////////////////////////////////////////////////////////////
// Query
////////////////////////////////////////////////////////////////////////////////
// True when the buffer can serve one chunk: with a negative
// frames_per_chunk ("fetch everything") any buffered frame suffices;
// otherwise a full chunk's worth of frames must be present.
bool Buffer::is_ready() const {
  return (frames_per_chunk < 0) ? (num_buffered_frames > 0)
                                : (num_buffered_frames >= frames_per_chunk);
}

////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Audio
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
// ref: https://ffmpeg.org/doxygen/4.1/filter__audio_8c_source.html#l00215
Expand Down Expand Up @@ -82,10 +101,64 @@ torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
}
} // namespace

void Buffer::push_audio_frame(AVFrame* pFrame) {
chunks.push_back(convert_audio_tensor(pFrame));
// Store one incoming audio Tensor (the samples of one AVFrame), keeping the
// invariant that every Tensor in `chunks` holds exactly `frames_per_chunk`
// frames, except possibly the newest one, which may be partial.
void AudioBuffer::push_tensor(torch::Tensor t) {
  // If frames_per_chunk < 0, users want to fetch all frames.
  // Just push back to chunks and that's it.
  if (frames_per_chunk < 0) {
    chunks.push_back(t);
    num_buffered_frames += t.size(0);
    return;
  }

  // Push
  // Note:
  // For audio, the incoming tensor contains multiple samples.
  // For a small `frames_per_chunk` value, the incoming Tensor might hold
  // more than `max_frames`. If we pushed the tensor as-is, the whole
  // Tensor could be discarded at the trimming stage below, leaving the
  // buffer always empty. So we slice the incoming Tensor and push it
  // chunk-by-chunk.

  // Check the last inserted Tensor; if the total number of buffered frames
  // is not a multiple of frames_per_chunk, the last Tensor is partial.
  // Merge it with the incoming tensor and re-slice the concatenation.
  if (num_buffered_frames % frames_per_chunk) {
    torch::Tensor prev = chunks.back();
    chunks.pop_back();
    num_buffered_frames -= prev.size(0);
    t = torch::cat({prev, t}, 0);
  }

  // Slice `t` into frames_per_chunk-sized pieces; the final (possibly
  // partial) piece is pushed as-is.
  while (true) {
    int num_input_frames = t.size(0);
    if (num_input_frames <= frames_per_chunk) {
      chunks.push_back(t);
      num_buffered_frames += num_input_frames;
      break;
    }
    // The input tensor contains more frames than frames_per_chunk:
    // split off the first chunk and keep slicing the remainder.
    auto splits = torch::tensor_split(t, {frames_per_chunk, num_input_frames});
    chunks.push_back(splits[0]);
    num_buffered_frames += frames_per_chunk;
    t = splits[1];
  }

  // Trim
  // Since frames_per_chunk > 0 here, retain at most
  // `num_chunks * frames_per_chunk` frames and discard the oldest Tensors.
  int max_frames = num_chunks * frames_per_chunk;
  while (num_buffered_frames > max_frames) {
    torch::Tensor& t = chunks.front();
    num_buffered_frames -= t.size(0);
    chunks.pop_front();
  }
}

// Convert one decoded audio AVFrame to a Tensor and buffer it,
// maintaining the per-chunk alignment enforced by push_tensor.
void AudioBuffer::push_frame(AVFrame* frame) {
  push_tensor(convert_audio_tensor(frame));
}

////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Video
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_image_tensor(AVFrame* pFrame) {
// ref:
Expand Down Expand Up @@ -130,34 +203,79 @@ torch::Tensor convert_image_tensor(AVFrame* pFrame) {
}
} // namespace

void Buffer::push_video_frame(AVFrame* pFrame) {
chunks.push_back(convert_image_tensor(pFrame));
// Store one incoming video Tensor. Unlike audio, the incoming Tensor is
// expected to contain only one frame, so it is pushed as-is.
void VideoBuffer::push_tensor(torch::Tensor t) {
  chunks.push_back(t);
  num_buffered_frames += t.size(0);

  // Negative frames_per_chunk means "keep everything": no trimming.
  if (frames_per_chunk < 0) {
    return;
  }

  // Trim: retain at most `num_chunks * frames_per_chunk` frames and drop
  // the oldest. Each push adds a single frame, so one check suffices to
  // restore the invariant.
  int max_frames = num_chunks * frames_per_chunk;
  if (num_buffered_frames > max_frames) {
    // Renamed from `t`: the original shadowed the parameter, which is
    // error-prone (clang-tidy bugprone-shadow).
    torch::Tensor& oldest = chunks.front();
    num_buffered_frames -= oldest.size(0);
    chunks.pop_front();
  }
}

torch::Tensor Buffer::pop_all() {
if (!chunks.size())
return torch::empty({});
// Convert one decoded video AVFrame to a Tensor and buffer it
// (one Tensor per frame in the deque).
void VideoBuffer::push_frame(AVFrame* frame) {
  push_tensor(convert_image_tensor(frame));
}

std::vector<torch::Tensor> tmp;
while (chunks.size()) {
tmp.push_back(chunks.front());
////////////////////////////////////////////////////////////////////////////////
// Modifiers - Pop
////////////////////////////////////////////////////////////////////////////////

using namespace torch::indexing;

// Pop the next chunk of frames.
// Returns an empty optional when nothing is buffered. A negative
// frames_per_chunk means "return everything buffered at once".
c10::optional<torch::Tensor> Buffer::pop_chunk() {
  if (num_buffered_frames == 0) {
    return c10::optional<torch::Tensor>{};
  }
  torch::Tensor chunk = (frames_per_chunk < 0) ? pop_all() : pop_one_chunk();
  return c10::optional<torch::Tensor>{chunk};
}

// Audio Tensors in the deque are already aligned to `frames_per_chunk`
// by push_tensor, so one chunk is simply the oldest buffered Tensor.
torch::Tensor AudioBuffer::pop_one_chunk() {
  torch::Tensor chunk = chunks.front();
  num_buffered_frames -= chunk.size(0);
  chunks.pop_front();
  return chunk;
}

// The video deque holds one frame per Tensor, so a chunk is assembled by
// collecting up to `frames_per_chunk` Tensors and concatenating them.
// Only called (via pop_chunk) when frames_per_chunk > 0 and at least one
// frame is buffered, so torch::cat never sees an empty list.
torch::Tensor VideoBuffer::pop_one_chunk() {
  std::vector<torch::Tensor> frames;
  frames.reserve(static_cast<size_t>(frames_per_chunk));
  // Cast avoids the implicit signed/unsigned comparison between
  // vector::size() (size_t) and frames_per_chunk (int).
  while (num_buffered_frames > 0 &&
         frames.size() < static_cast<size_t>(frames_per_chunk)) {
    frames.push_back(chunks.front());
    chunks.pop_front();
    num_buffered_frames -= 1;
  }
  return torch::cat(frames, 0);
}

void Buffer::push_frame(AVFrame* frame) {
switch (media_type) {
case AVMEDIA_TYPE_AUDIO:
push_audio_frame(frame);
break;
case AVMEDIA_TYPE_VIDEO:
push_video_frame(frame);
break;
default:
throw std::runtime_error(
"Unexpected media type. Only audio/video is supported.");
// Drain every buffered Tensor and concatenate them along the frame
// dimension. Common to audio/video: in the audio case each Tensor holds
// multiple frames, in the video case each Tensor holds one frame.
torch::Tensor Buffer::pop_all() {
  std::vector<torch::Tensor> drained;
  drained.reserve(chunks.size());
  while (!chunks.empty()) {
    torch::Tensor& front = chunks.front();
    num_buffered_frames -= front.size(0);
    drained.push_back(front);
    chunks.pop_front();
  }
  return torch::cat(drained, 0);
}

} // namespace ffmpeg
} // namespace torchaudio
74 changes: 69 additions & 5 deletions torchaudio/csrc/ffmpeg/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,82 @@ namespace torchaudio {
namespace ffmpeg {

// Abstract base for buffering decoded frames as Tensors and serving them
// back in fixed-size chunks. AudioBuffer/VideoBuffer specialize push/pop.
// NOTE(review): the scraped diff interleaved removed lines (old AVMediaType
// member, old ctor and push_audio/video_frame declarations) into this class;
// they are dropped here so the declaration is consistent.
class Buffer {
 protected:
  // Each AVFrame is converted to a Tensor and stored here.
  std::deque<torch::Tensor> chunks;

  // The number of frames to return as a chunk.
  // If <0, the user wants to receive all the buffered frames at once.
  const int frames_per_chunk;
  // The number of chunks to retain.
  const int num_chunks;
  // The number of currently buffered frames.
  // For video, one Tensor corresponds to one frame, but for audio,
  // one Tensor contains multiple samples, so we track the count here.
  int num_buffered_frames = 0;

 public:
  Buffer(int frames_per_chunk, int num_chunks);
  virtual ~Buffer() = default;

  //////////////////////////////////////////////////////////////////////////
  // Query
  //////////////////////////////////////////////////////////////////////////
  // Check if the buffer has enough frames for a chunk.
  // If frames_per_chunk < 0, returns true when there are > 0 frames.
  // Otherwise, returns num_buffered_frames >= frames_per_chunk.
  bool is_ready() const;

  //////////////////////////////////////////////////////////////////////////
  // Modifiers
  //////////////////////////////////////////////////////////////////////////
  // Convert the frame to a Tensor and store it.
  virtual void push_frame(AVFrame* frame) = 0;

  // Pop one chunk (or everything when frames_per_chunk < 0).
  // Empty optional when nothing is buffered.
  c10::optional<torch::Tensor> pop_chunk();

 private:
  virtual torch::Tensor pop_one_chunk() = 0;
  torch::Tensor pop_all();
};

// Specialization of the handling around push/pop for audio/video.

////////////////////////////////////////////////////////////////////////////////
// AudioBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For audio, an input AVFrame contains multiple frames.
// When popping the buffered frames chunk-by-chunk, it is easier if they are
// organized by chunk when pushed to the deque object.
// Therefore, audio implements a pushing mechanism that ensures each Tensor
// in the deque holds exactly `frames_per_chunk` frames.
class AudioBuffer : public Buffer {
public:
AudioBuffer(int frames_per_chunk, int num_chunks);

void push_frame(AVFrame* frame);

private:
void push_tensor(torch::Tensor tensor);
torch::Tensor pop_one_chunk();
};

////////////////////////////////////////////////////////////////////////////////
// VideoBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For video, an input AVFrame contains one frame.
// Contrary to audio, it is simple to push one frame at a time to the deque.
// But this means that chunks consisting of multiple frames have to be
// assembled at popping time.
class VideoBuffer : public Buffer {
public:
VideoBuffer(int frames_per_chunk, int num_chunks);

void push_frame(AVFrame* frame);

private:
void push_tensor(torch::Tensor tensor);
torch::Tensor pop_one_chunk();
};

} // namespace ffmpeg
} // namespace torchaudio
25 changes: 11 additions & 14 deletions torchaudio/csrc/ffmpeg/ffmpeg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,20 @@ namespace {
// Open the input `src` (optionally forcing the input device/format) and
// return the resulting AVFormatContext. Throws std::runtime_error on
// failure. The option map is converted to a temporary AVDictionary that is
// always freed, whether or not opening succeeds.
// NOTE(review): the scraped diff interleaved the removed `AVDictionary**`
// parameter and old avformat_open_input call into this span; they are
// dropped here so the function is consistent.
AVFormatContext* get_format_context(
    const std::string& src,
    const std::string& device,
    const std::map<std::string, std::string>& option) {
  AVFormatContext* pFormat = NULL;
  AVInputFormat* pInput =
      device.empty() ? NULL : av_find_input_format(device.c_str());

  AVDictionary* dict = NULL;
  // const auto&: no need to copy (or allow mutation of) the map entries.
  for (const auto& it : option) {
    av_dict_set(&dict, it.first.c_str(), it.second.c_str(), 0);
  }

  int ret = avformat_open_input(&pFormat, src.c_str(), pInput, &dict);
  av_dict_free(&dict);

  if (ret < 0)
    throw std::runtime_error("Failed to open the input: " + src);
  return pFormat;
}
Expand All @@ -28,7 +36,7 @@ AVFormatContext* get_format_context(
AVFormatContextPtr::AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option)
const std::map<std::string, std::string>& option)
: Wrapper<AVFormatContext, AVFormatContextDeleter>(
get_format_context(src, device, option)) {
if (avformat_find_stream_info(ptr.get(), NULL) < 0)
Expand Down Expand Up @@ -82,17 +90,6 @@ AVFrame* get_av_frame() {

// RAII wrapper construction: takes ownership of the frame returned by
// get_av_frame() (body not visible in this hunk — presumably allocates via
// av_frame_alloc and throws on failure; confirm against the full file).
AVFramePtr::AVFramePtr() : Wrapper<AVFrame, AVFrameDeleter>(get_av_frame()) {}

///////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
AutoFrameUnref::AutoFrameUnref(AVFramePtr& p) : p_(p){};
AutoFrameUnref::~AutoFrameUnref() {
av_frame_unref(p_);
}
AutoFrameUnref::operator AVFrame*() const {
return p_;
}

////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
Expand Down
15 changes: 2 additions & 13 deletions torchaudio/csrc/ffmpeg/ffmpeg.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// One-stop header for all ffmpeg needs
#pragma once
#include <cstdint>
#include <map>
#include <memory>
#include <string>

Expand Down Expand Up @@ -58,7 +59,7 @@ struct AVFormatContextPtr
AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option);
const std::map<std::string, std::string>& option);
};

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -101,18 +102,6 @@ struct AVFramePtr : public Wrapper<AVFrame, AVFrameDeleter> {
AVFramePtr();
};

////////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
// Similar to `AutoPacketUnref`, this structure will release the memory
// allocated for frame content.
struct AutoFrameUnref {
AVFramePtr& p_;
AutoFrameUnref(AVFramePtr& p);
~AutoFrameUnref();
operator AVFrame*() const;
};

////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
Expand Down
7 changes: 7 additions & 0 deletions torchaudio/csrc/ffmpeg/filter_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ FilterGraph::FilterGraph(
create_filter();
}

////////////////////////////////////////////////////////////////////////////////
// Query method
////////////////////////////////////////////////////////////////////////////////
// Returns the filter description string this graph was constructed with.
// (Dropped the stray `;` after the function body — an empty declaration
// flagged by -Wextra-semi.)
std::string FilterGraph::get_description() const {
  return filter_description;
}

////////////////////////////////////////////////////////////////////////////////
// Configuration methods
////////////////////////////////////////////////////////////////////////////////
Expand Down
Loading

0 comments on commit 822d08e

Please sign in to comment.