From 36c61a3312110d0f1e6167d1c5ae209ae499559a Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Sun, 28 Nov 2021 13:21:30 -0500
Subject: [PATCH 1/2] Add wrapper classes that manage memories allocated by
 ffmpeg

---
 torchaudio/csrc/ffmpeg/ffmpeg.cpp | 151 ++++++++++++++++++++++++++++++
 torchaudio/csrc/ffmpeg/ffmpeg.h   | 132 ++++++++++++++++++++++++++
 2 files changed, 283 insertions(+)
 create mode 100644 torchaudio/csrc/ffmpeg/ffmpeg.cpp
 create mode 100644 torchaudio/csrc/ffmpeg/ffmpeg.h

diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
new file mode 100644
index 0000000000..547c8e915b
--- /dev/null
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
@@ -0,0 +1,151 @@
+#include <torchaudio/csrc/ffmpeg/ffmpeg.h>
+
+namespace torchaudio {
+namespace ffmpeg {
+
+////////////////////////////////////////////////////////////////////////////////
+// AVFormatContext
+////////////////////////////////////////////////////////////////////////////////
+void AVFormatContextDeleter::operator()(AVFormatContext* p) {
+  avformat_close_input(&p); // receives the address of the local copy `p`
+};
+
+namespace {
+AVFormatContext* get_format_context(const std::string& src) {
+  AVFormatContext* pFormat = NULL;
+  if (avformat_open_input(&pFormat, src.c_str(), NULL, NULL) < 0)
+    throw std::runtime_error("Failed to open the input: " + src);
+  return pFormat;
+}
+} // namespace
+
+AVFormatContextPtr::AVFormatContextPtr(const std::string& src)
+    : Wrapper<AVFormatContext, AVFormatContextDeleter>(
+          get_format_context(src)) {
+  if (avformat_find_stream_info(ptr.get(), NULL) < 0)
+    throw std::runtime_error("Failed to find stream information.");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// AVPacket
+////////////////////////////////////////////////////////////////////////////////
+void AVPacketDeleter::operator()(AVPacket* packet) {
+  av_packet_free(&packet); // receives the address of the local pointer copy
+}
+
+namespace {
+AVPacket* get_av_packet() {
+  // A failed allocation is reported by exception, never by a null return.
+  if (AVPacket* packet = av_packet_alloc())
+    return packet;
+  throw std::runtime_error("Failed to allocate AVPacket object.");
+}
+} // namespace
+
+// Hands the packet returned by get_av_packet() to the owning base class.
+AVPacketPtr::AVPacketPtr() : Wrapper(get_av_packet()) {}
+
+////////////////////////////////////////////////////////////////////////////////
+// AVPacket - buffer unref
+////////////////////////////////////////////////////////////////////////////////
+// Stores a reference to the caller's wrapper; the packet itself is not owned.
+AutoPacketUnref::AutoPacketUnref(AVPacketPtr& p) : p_(p) {}
+AutoPacketUnref::~AutoPacketUnref() {
+  av_packet_unref(static_cast<AVPacket*>(p_)); // explicit wrapper -> raw
+}
+// Yields the raw AVPacket* held by the referenced wrapper.
+AutoPacketUnref::operator AVPacket*() const { return p_; }
+
+////////////////////////////////////////////////////////////////////////////////
+// AVFrame
+////////////////////////////////////////////////////////////////////////////////
+void AVFrameDeleter::operator()(AVFrame* frame) {
+  av_frame_free(&frame); // receives the address of the local pointer copy
+}
+namespace {
+AVFrame* get_av_frame() {
+  // A failed allocation is reported by exception, never by a null return.
+  if (AVFrame* frame = av_frame_alloc())
+    return frame;
+  throw std::runtime_error("Failed to allocate AVFrame object.");
+}
+} // namespace
+
+AVFramePtr::AVFramePtr() : Wrapper(get_av_frame()) {} // base owns the frame
+
+////////////////////////////////////////////////////////////////////////////////
+// AVFrame - buffer unref
+////////////////////////////////////////////////////////////////////////////////
+// Stores a reference to the caller's wrapper; the frame itself is not owned.
+AutoFrameUnref::AutoFrameUnref(AVFramePtr& p) : p_(p) {}
+AutoFrameUnref::~AutoFrameUnref() {
+  av_frame_unref(static_cast<AVFrame*>(p_)); // explicit wrapper -> raw
+}
+// Yields the raw AVFrame* held by the referenced wrapper.
+AutoFrameUnref::operator AVFrame*() const { return p_; }
+
+////////////////////////////////////////////////////////////////////////////////
+// AVCodecContext
+////////////////////////////////////////////////////////////////////////////////
+void AVCodecContextDeleter::operator()(AVCodecContext* context) {
+  avcodec_free_context(&context); // receives the address of the local copy
+}
+
+namespace {
+AVCodecContext* get_codec_context(AVCodecParameters* pParams) {
+  // Look up a decoder by the codec id carried in the parameters.
+  const AVCodec* decoder = avcodec_find_decoder(pParams->codec_id);
+  if (decoder == nullptr)
+    throw std::runtime_error("Unknown codec.");
+
+  // Allocate the context for that decoder; the raw pointer is returned as-is.
+  AVCodecContext* context = avcodec_alloc_context3(decoder);
+  if (context == nullptr)
+    throw std::runtime_error("Failed to allocate CodecContext.");
+
+  return context;
+}
+
+// Copies the stream parameters into the context, then opens the decoder.
+void init_codec_context(
+    AVCodecContext* pCodecContext,
+    AVCodecParameters* pParams) {
+  if (avcodec_parameters_to_context(pCodecContext, pParams) < 0)
+    throw std::runtime_error("Failed to set CodecContext parameter.");
+
+  const AVCodec* decoder = avcodec_find_decoder(pParams->codec_id);
+  if (avcodec_open2(pCodecContext, decoder, NULL) < 0)
+    throw std::runtime_error("Failed to initialize CodecContext.");
+
+  // No audio layout given: store the default for the channel count in pParams.
+  const bool isAudio = pParams->codec_type == AVMEDIA_TYPE_AUDIO;
+  if (isAudio && !pParams->channel_layout)
+    pParams->channel_layout =
+        av_get_default_channel_layout(pCodecContext->channels);
+}
+} // namespace
+
+AVCodecContextPtr::AVCodecContextPtr(AVCodecParameters* pParam)
+    : Wrapper(get_codec_context(pParam)) {
+  // The base owns the context by now, so a throw below still releases it.
+  init_codec_context(ptr.get(), pParam);
+}
+////////////////////////////////////////////////////////////////////////////////
+// AVFilterGraph
+////////////////////////////////////////////////////////////////////////////////
+void AVFilterGraphDeleter::operator()(AVFilterGraph* graph) {
+  avfilter_graph_free(&graph); // receives the address of the local copy
+}
+
+namespace {
+AVFilterGraph* get_filter_graph() {
+  // Name the resource in the error, matching the other allocation helpers.
+  if (AVFilterGraph* graph = avfilter_graph_alloc())
+    return graph;
+  throw std::runtime_error("Failed to allocate AVFilterGraph object.");
+}
+} // namespace
+// Hands the graph returned by get_filter_graph() to the owning base class.
+AVFilterGraphPtr::AVFilterGraphPtr() : Wrapper(get_filter_graph()) {}
+} // namespace ffmpeg
+} // namespace torchaudio
diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.h b/torchaudio/csrc/ffmpeg/ffmpeg.h
new file mode 100644
index 0000000000..ec9c40976c
--- /dev/null
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.h
@@ -0,0 +1,132 @@
+// One-stop header for all FFmpeg needs
+#pragma once
+#include <cstdint>
+#include <memory>
+#include <string>
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavfilter/avfilter.h>
+#include <libavfilter/buffersink.h>
+#include <libavfilter/buffersrc.h>
+#include <libavformat/avformat.h>
+#include <libavutil/avutil.h>
+#include <libavutil/frame.h>
+#include <libavutil/imgutils.h>
+#include <libavutil/pixdesc.h>
+}
+
+namespace torchaudio {
+namespace ffmpeg {
+
+// Base structure that handles memory management.
+// Resource is freed by the destructor of unique_ptr,
+// which will call custom delete mechanism provided via Deleter
+// https://stackoverflow.com/a/19054280
+//
+// The resource allocation will be provided by custom constructors.
+template <typename T, typename Deleter>
+class Wrapper {
+ protected:
+  // Sole owner of the FFmpeg object; `Deleter` runs when the wrapper dies.
+  std::unique_ptr<T, Deleter> ptr;
+
+ public:
+  Wrapper() = delete;
+  // Takes ownership of `t`. Spelled without template arguments because a
+  // `Wrapper<T, Deleter>(...)` constructor declarator is ill-formed in C++20.
+  Wrapper(T* t) : ptr(t) {}
+  // Member access on the managed object.
+  T* operator->() const { return ptr.get(); }
+  // True when a non-null object is held.
+  explicit operator bool() const { return static_cast<bool>(ptr); }
+  // Implicit conversion so the wrapper can be handed to FFmpeg's C API.
+  operator T*() const { return ptr.get(); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// AVFormatContext
+////////////////////////////////////////////////////////////////////////////////
+struct AVFormatContextDeleter {
+  void operator()(AVFormatContext* p); // calls avformat_close_input
+};
+
+struct AVFormatContextPtr
+    : public Wrapper<AVFormatContext, AVFormatContextDeleter> {
+  AVFormatContextPtr(const std::string& src);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// AVPacket
+////////////////////////////////////////////////////////////////////////////////
+struct AVPacketDeleter {
+  void operator()(AVPacket* p); // calls av_packet_free
+};
+
+struct AVPacketPtr : public Wrapper<AVPacket, AVPacketDeleter> {
+  AVPacketPtr(); // av_packet_alloc; throws std::runtime_error on failure
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// AVPacket - buffer unref
+////////////////////////////////////////////////////////////////////////////////
+// AVPacket structure employs two-staged memory allocation.
+// The first-stage is for allocating AVPacket object itself, and it typically
+// happens only once throughout the lifetime of application.
+// The second-stage is for allocating the content (media data) each time the
+// input file is processed and a chunk of data is read. The memory allocated
+// during this time has to be released before the next iteration.
+// The first-stage memory management is handled by `AVPacketPtr`.
+// `AutoPacketUnref` handles the second-stage memory management.
+struct AutoPacketUnref {
+  AVPacketPtr& p_; // referenced wrapper; not owned by this guard
+  AutoPacketUnref(AVPacketPtr& p); // binds `p_` to `p`
+  ~AutoPacketUnref(); // calls av_packet_unref on the referenced packet
+  operator AVPacket*() const; // raw pointer of the referenced packet
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// AVFrame
+////////////////////////////////////////////////////////////////////////////////
+struct AVFrameDeleter {
+  void operator()(AVFrame* p); // calls av_frame_free
+};
+
+struct AVFramePtr : public Wrapper<AVFrame, AVFrameDeleter> {
+  AVFramePtr(); // av_frame_alloc; throws std::runtime_error on failure
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// AVFrame - buffer unref
+////////////////////////////////////////////////////////////////////////////////
+// Similar to `AutoPacketUnref`, this structure will release the memory
+// allocated for frame content.
+struct AutoFrameUnref {
+  AVFramePtr& p_; // referenced wrapper; not owned by this guard
+  AutoFrameUnref(AVFramePtr& p); // binds `p_` to `p`
+  ~AutoFrameUnref(); // calls av_frame_unref on the referenced frame
+  operator AVFrame*() const; // raw pointer of the referenced frame
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// AVCodecContext
+////////////////////////////////////////////////////////////////////////////////
+struct AVCodecContextDeleter {
+  void operator()(AVCodecContext* p); // calls avcodec_free_context
+};
+struct AVCodecContextPtr
+    : public Wrapper<AVCodecContext, AVCodecContextDeleter> {
+  AVCodecContextPtr(AVCodecParameters* pParam); // opens a decoder; may throw
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// AVFilterGraph
+////////////////////////////////////////////////////////////////////////////////
+struct AVFilterGraphDeleter {
+  void operator()(AVFilterGraph* p); // calls avfilter_graph_free
+};
+struct AVFilterGraphPtr : public Wrapper<AVFilterGraph, AVFilterGraphDeleter> {
+  AVFilterGraphPtr(); // avfilter_graph_alloc; throws std::runtime_error
+};
+} // namespace ffmpeg
+} // namespace torchaudio

From 014b11136efb63efbc0519d7ba5815f65410f326 Mon Sep 17 00:00:00 2001
From: moto <855818+mthrok@users.noreply.github.com>
Date: Fri, 3 Dec 2021 23:33:10 -0500
Subject: [PATCH 2/2] add device support

---
 torchaudio/csrc/ffmpeg/ffmpeg.cpp | 17 +++++++++++++----
 torchaudio/csrc/ffmpeg/ffmpeg.h   |  7 ++++++-
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
index 547c8e915b..ed434dc0aa 100644
--- a/torchaudio/csrc/ffmpeg/ffmpeg.cpp
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
@@ -11,17 +11,26 @@ void AVFormatContextDeleter::operator()(AVFormatContext* p) {
 };
 
 namespace {
-AVFormatContext* get_format_context(const std::string& src) {
+// Opens `src`. A non-empty `device` is looked up as an input format name and
+// passed to avformat_open_input; an unknown name throws std::runtime_error.
+AVFormatContext* get_format_context(
+    const std::string& src, const std::string& device, AVDictionary** option) {
   AVFormatContext* pFormat = NULL;
-  if (avformat_open_input(&pFormat, src.c_str(), NULL, NULL) < 0)
+  auto* pInput = device.empty() ? NULL : av_find_input_format(device.c_str());
+  if (!device.empty() && !pInput) // unknown name must not fall back to probing
+    throw std::runtime_error("Unsupported device/format: " + device);
+  if (avformat_open_input(&pFormat, src.c_str(), pInput, option) < 0)
     throw std::runtime_error("Failed to open the input: " + src);
   return pFormat;
 }
 } // namespace
 
-AVFormatContextPtr::AVFormatContextPtr(const std::string& src)
+AVFormatContextPtr::AVFormatContextPtr(
+    const std::string& src, // input passed to avformat_open_input
+    const std::string& device, // empty string => NULL input format
+    AVDictionary** option) // handed through to avformat_open_input
     : Wrapper<AVFormatContext, AVFormatContextDeleter>(
-          get_format_context(src)) {
+          get_format_context(src, device, option)) {
   if (avformat_find_stream_info(ptr.get(), NULL) < 0)
     throw std::runtime_error("Failed to find stream information.");
 }
diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.h b/torchaudio/csrc/ffmpeg/ffmpeg.h
index ec9c40976c..da058e33c8 100644
--- a/torchaudio/csrc/ffmpeg/ffmpeg.h
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.h
@@ -6,6 +6,7 @@
 
 extern "C" {
 #include <libavcodec/avcodec.h>
+#include <libavdevice/avdevice.h>
 #include <libavfilter/avfilter.h>
 #include <libavfilter/buffersink.h>
 #include <libavfilter/buffersrc.h>
@@ -13,6 +14,7 @@ extern "C" {
 #include <libavutil/avutil.h>
 #include <libavutil/frame.h>
 #include <libavutil/imgutils.h>
+#include <libavutil/log.h>
 #include <libavutil/pixdesc.h>
 }
 
@@ -53,7 +55,10 @@ struct AVFormatContextDeleter {
 
 struct AVFormatContextPtr
     : public Wrapper<AVFormatContext, AVFormatContextDeleter> {
-  AVFormatContextPtr(const std::string& src);
+  AVFormatContextPtr( // throws std::runtime_error when opening/probing fails
+      const std::string& src, // input passed to avformat_open_input
+      const std::string& device, // input format name; may be empty
+      AVDictionary** option); // handed through to avformat_open_input
 };
 
 ////////////////////////////////////////////////////////////////////////////////