Fix metadata fetch (#2464)

Summary: In #2461, a `metadata` field was added to StreamInfo. However, the value attached to this new field was the source-level metadata, even though each stream can carry its own, different metadata.
* source-level metadata: [AVFormatContext->metadata](https://ffmpeg.org/doxygen/4.1/structAVFormatContext.html#a3019a56080ed2e3297ff25bc2ff88adf)
* stream-level metadata: [AVFormatContext->streams[]->metadata](https://ffmpeg.org/doxygen/4.1/structAVStream.html#a50d250a128a3da9ce3d135e84213fb82)
This commit moves the source-level metadata to a dedicated method, `get_metadata`, and fixes the per-stream `metadata` field so that it reports the stream-level metadata.
Pull Request resolved: #2464
Reviewed By: hwangjeff, xiaohui-zhang
Differential Revision: D36995452
Pulled By: mthrok
fbshipit-source-id: 534be1f7feb07790a0ce8624c336cdb7b65a8697
pytorch · Jun 8, 2022 · 4d2fa19 · 4d2fa19
1 parent 711d601
commit 4d2fa19
Show file tree

Hide file tree

Showing 8 changed files with 83 additions and 33 deletions.
diff --git a/test/torchaudio_unittest/io/stream_reader_test.py b/test/torchaudio_unittest/io/stream_reader_test.py
@@ -89,12 +89,11 @@ def test_src_info(self):
         s = StreamReader(self.get_src())
         assert s.num_src_streams == 6
 
-        metadata = {
-            "compatible_brands": "isomiso2avc1mp41",
-            "encoder": "Lavf58.76.100",
-            "major_brand": "isom",
-            "minor_version": "512",
-        }
+        # Note:
+        # FFmpeg 4.4.1 and FFmpeg 5 also report
+        # `"vendor_id": "[0][0][0][0]"` in audio/video metadata.
+        # TODO:
+        # change expected metadata value based on FFmpeg version.
         expected = [
             StreamReaderSourceVideoStream(
                 media_type="video",
@@ -104,7 +103,10 @@ def test_src_info(self):
                 bit_rate=71925,
                 num_frames=325,
                 bits_per_sample=8,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "\x1fMainconcept Video Media Handler",
+                    "language": "eng",
+                },
                 width=320,
                 height=180,
                 frame_rate=25.0,
@@ -117,7 +119,10 @@ def test_src_info(self):
                 bit_rate=72093,
                 num_frames=103,
                 bits_per_sample=0,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "#Mainconcept MP4 Sound Media Handler",
+                    "language": "eng",
+                },
                 sample_rate=8000.0,
                 num_channels=2,
             ),
@@ -129,7 +134,10 @@ def test_src_info(self):
                 bit_rate=None,
                 num_frames=None,
                 bits_per_sample=None,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "SubtitleHandler",
+                    "language": "eng",
+                },
             ),
             StreamReaderSourceVideoStream(
                 media_type="video",
@@ -139,7 +147,10 @@ def test_src_info(self):
                 bit_rate=128783,
                 num_frames=390,
                 bits_per_sample=8,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "\x1fMainconcept Video Media Handler",
+                    "language": "eng",
+                },
                 width=480,
                 height=270,
                 frame_rate=29.97002997002997,
@@ -152,7 +163,10 @@ def test_src_info(self):
                 bit_rate=128837,
                 num_frames=205,
                 bits_per_sample=0,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "#Mainconcept MP4 Sound Media Handler",
+                    "language": "eng",
+                },
                 sample_rate=16000.0,
                 num_channels=2,
             ),
@@ -164,32 +178,44 @@ def test_src_info(self):
                 bit_rate=None,
                 num_frames=None,
                 bits_per_sample=None,
-                metadata=metadata,
+                metadata={
+                    "handler_name": "SubtitleHandler",
+                    "language": "eng",
+                },
             ),
         ]
         output = [s.get_src_stream_info(i) for i in range(6)]
+        # Remove "vendor_id" if exists
+        # TODO: don't remove "vendor_id", instead,
+        # change expected based on FFmpeg version
+        for sinfo in output:
+            if "vendor_id" in sinfo.metadata:
+                del sinfo.metadata["vendor_id"]
         assert expected == output
 
     def test_id3tag(self):
+        """get_metadata method can fetch id3tag properly"""
         s = StreamReader(self.get_src("steam-train-whistle-daniel_simon.mp3"))
-        output = s.get_src_stream_info(s.default_audio_stream)
-
-        expected = StreamReaderSourceAudioStream(
-            media_type="audio",
-            codec="mp3",
-            codec_long_name="MP3 (MPEG audio layer 3)",
-            format="fltp",
-            bit_rate=210571,
-            num_frames=0,
-            bits_per_sample=0,
-            metadata={
-                "title": "SoundBible.com Must Credit",
-                "artist": "SoundBible.com Must Credit",
-                "date": "2017",
-            },
-            sample_rate=44100.0,
-            num_channels=2,
-        )
+        output = s.get_metadata()
+
+        expected = {
+            "title": "SoundBible.com Must Credit",
+            "artist": "SoundBible.com Must Credit",
+            "date": "2017",
+        }
+        assert output == expected
+
+    def test_video_metadata(self):
+        """get_metadata method can fetch video metadata"""
+        s = StreamReader(self.get_src())
+        output = s.get_metadata()
+
+        expected = {
+            "compatible_brands": "isomiso2avc1mp41",
+            "encoder": "Lavf58.76.100",
+            "major_brand": "isom",
+            "minor_version": "512",
+        }
         assert output == expected
 
     def test_src_info_invalid_index(self):

diff --git a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
@@ -22,6 +22,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
       .def(
           "find_best_video_stream",
           &StreamReaderFileObj::find_best_video_stream)
+      .def("get_metadata", &StreamReaderFileObj::get_metadata)
       .def(
           "get_src_stream_info",
           &StreamReaderFileObj::get_src_stream_info_pybind)

diff --git a/torchaudio/csrc/ffmpeg/pybind/stream_reader.cpp b/torchaudio/csrc/ffmpeg/pybind/stream_reader.cpp
@@ -15,5 +15,12 @@ StreamReaderFileObj::StreamReaderFileObj(
           option.value_or(OptionDict{}),
           pAVIO)) {}
 
+std::map<std::string, std::string> StreamReaderFileObj::get_metadata() const {
+  std::map<std::string, std::string> ret;
+  for (const auto& it : StreamReader::get_metadata()) {
+    ret.insert({it.key(), it.value()});
+  }
+  return ret;
+};
 } // namespace ffmpeg
 } // namespace torchaudio
diff --git a/torchaudio/csrc/ffmpeg/pybind/stream_reader.h b/torchaudio/csrc/ffmpeg/pybind/stream_reader.h
@@ -15,6 +15,8 @@ class StreamReaderFileObj : protected FileObj, public StreamReaderBinding {
       const c10::optional<std::string>& format,
       const c10::optional<OptionDict>& option,
       int64_t buffer_size);
+
+  std::map<std::string, std::string> get_metadata() const;
 };
 
 } // namespace ffmpeg

diff --git a/torchaudio/csrc/ffmpeg/stream_reader.cpp b/torchaudio/csrc/ffmpeg/stream_reader.cpp
@@ -83,6 +83,10 @@ c10::Dict<std::string, std::string> parse_metadata(
 }
 } // namespace
 
+c10::Dict<std::string, std::string> StreamReader::get_metadata() const {
+  return parse_metadata(pFormatContext->metadata);
+}
+
 SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
   validate_src_stream_index(i);
   AVStream* stream = pFormatContext->streams[i];
@@ -93,7 +97,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
   ret.bit_rate = codecpar->bit_rate;
   ret.num_frames = stream->nb_frames;
   ret.bits_per_sample = codecpar->bits_per_raw_sample;
-  ret.metadata = parse_metadata(pFormatContext->metadata);
+  ret.metadata = parse_metadata(stream->metadata);
   const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id);
   if (desc) {
     ret.codec_name = desc->name;

diff --git a/torchaudio/csrc/ffmpeg/stream_reader.h b/torchaudio/csrc/ffmpeg/stream_reader.h
@@ -44,6 +44,8 @@ class StreamReader {
   // Find a suitable audio/video streams using heuristics from ffmpeg
   int64_t find_best_audio_stream() const;
   int64_t find_best_video_stream() const;
+  // Fetch metadata of the source
+  c10::Dict<std::string, std::string> get_metadata() const;
   // Fetch information about source streams
   int64_t num_src_streams() const;
   SrcStreamInfo get_src_stream_info(int i) const;

diff --git a/torchaudio/csrc/ffmpeg/stream_reader_binding.cpp b/torchaudio/csrc/ffmpeg/stream_reader_binding.cpp
@@ -40,6 +40,7 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
       .def(torch::init<>(init))
       .def("num_src_streams", [](S self) { return self->num_src_streams(); })
       .def("num_out_streams", [](S self) { return self->num_out_streams(); })
+      .def("get_metadata", [](S self) { return self->get_metadata(); })
       .def(
           "get_src_stream_info",
           [](S s, int64_t i) { return s->get_src_stream_info(i); })

diff --git a/torchaudio/io/_stream_reader.py b/torchaudio/io/_stream_reader.py
@@ -62,8 +62,7 @@ class StreamReaderSourceStream:
     For compressed format, it can be 0.
     """
     metadata: Dict[str, str]
-    """Metadata attached to the source media.
-    Note that metadata is common across the source streams."""
+    """Metadata attached to the source stream."""
 
 
 @dataclass
@@ -397,6 +396,14 @@ def default_video_stream(self):
         """
         return self._default_video_stream
 
+    def get_metadata(self) -> Dict[str, str]:
+        """Get the metadata of the source media.
+
+        Returns:
+            dict
+        """
+        return self._be.get_metadata()
+
     def get_src_stream_info(self, i: int) -> torchaudio.io.StreamReaderSourceStream:
         """Get the metadata of source stream