Skip to content

Commit

Permalink
Fix metadata fetch (#2464)
Browse files Browse the repository at this point in the history
Summary:
In #2461, a `metadata` field was added to StreamInfo.
However, the value attached to this new field was the source-level metadata,
even though each stream can have its own, different metadata.

* source level metadata
[AVFormatContext->metadata](https://ffmpeg.org/doxygen/4.1/structAVFormatContext.html#a3019a56080ed2e3297ff25bc2ff88adf)
* stream level metadata
[AVFormatContext->streams[]->metadata](https://ffmpeg.org/doxygen/4.1/structAVStream.html#a50d250a128a3da9ce3d135e84213fb82)

This commit moves the source-level metadata to a dedicated method, `get_metadata`, and
fixes the stream-level `metadata` field so that it reports the metadata of the stream itself.

Pull Request resolved: #2464

Reviewed By: hwangjeff, xiaohui-zhang

Differential Revision: D36995452

Pulled By: mthrok

fbshipit-source-id: 534be1f7feb07790a0ce8624c336cdb7b65a8697
  • Loading branch information
mthrok authored and facebook-github-bot committed Jun 8, 2022
1 parent 711d601 commit 4d2fa19
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 33 deletions.
86 changes: 56 additions & 30 deletions test/torchaudio_unittest/io/stream_reader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,11 @@ def test_src_info(self):
s = StreamReader(self.get_src())
assert s.num_src_streams == 6

metadata = {
"compatible_brands": "isomiso2avc1mp41",
"encoder": "Lavf58.76.100",
"major_brand": "isom",
"minor_version": "512",
}
# Note:
# FFmpeg 4.4.1 and FFmpeg 5 also report
# `"vendor_id": "[0][0][0][0]"` in audio/video metadata.
# TODO:
# change expected metadata value based on FFmpeg version.
expected = [
StreamReaderSourceVideoStream(
media_type="video",
Expand All @@ -104,7 +103,10 @@ def test_src_info(self):
bit_rate=71925,
num_frames=325,
bits_per_sample=8,
metadata=metadata,
metadata={
"handler_name": "\x1fMainconcept Video Media Handler",
"language": "eng",
},
width=320,
height=180,
frame_rate=25.0,
Expand All @@ -117,7 +119,10 @@ def test_src_info(self):
bit_rate=72093,
num_frames=103,
bits_per_sample=0,
metadata=metadata,
metadata={
"handler_name": "#Mainconcept MP4 Sound Media Handler",
"language": "eng",
},
sample_rate=8000.0,
num_channels=2,
),
Expand All @@ -129,7 +134,10 @@ def test_src_info(self):
bit_rate=None,
num_frames=None,
bits_per_sample=None,
metadata=metadata,
metadata={
"handler_name": "SubtitleHandler",
"language": "eng",
},
),
StreamReaderSourceVideoStream(
media_type="video",
Expand All @@ -139,7 +147,10 @@ def test_src_info(self):
bit_rate=128783,
num_frames=390,
bits_per_sample=8,
metadata=metadata,
metadata={
"handler_name": "\x1fMainconcept Video Media Handler",
"language": "eng",
},
width=480,
height=270,
frame_rate=29.97002997002997,
Expand All @@ -152,7 +163,10 @@ def test_src_info(self):
bit_rate=128837,
num_frames=205,
bits_per_sample=0,
metadata=metadata,
metadata={
"handler_name": "#Mainconcept MP4 Sound Media Handler",
"language": "eng",
},
sample_rate=16000.0,
num_channels=2,
),
Expand All @@ -164,32 +178,44 @@ def test_src_info(self):
bit_rate=None,
num_frames=None,
bits_per_sample=None,
metadata=metadata,
metadata={
"handler_name": "SubtitleHandler",
"language": "eng",
},
),
]
output = [s.get_src_stream_info(i) for i in range(6)]
# Remove "vendor_id" if exists
# TODO: don't remove "vendor_id", instead,
# change expected based on FFmpeg version
for sinfo in output:
if "vendor_id" in sinfo.metadata:
del sinfo.metadata["vendor_id"]
assert expected == output

def test_id3tag(self):
"""get_metadata method can fetch id3tag properly"""
s = StreamReader(self.get_src("steam-train-whistle-daniel_simon.mp3"))
output = s.get_src_stream_info(s.default_audio_stream)

expected = StreamReaderSourceAudioStream(
media_type="audio",
codec="mp3",
codec_long_name="MP3 (MPEG audio layer 3)",
format="fltp",
bit_rate=210571,
num_frames=0,
bits_per_sample=0,
metadata={
"title": "SoundBible.com Must Credit",
"artist": "SoundBible.com Must Credit",
"date": "2017",
},
sample_rate=44100.0,
num_channels=2,
)
output = s.get_metadata()

expected = {
"title": "SoundBible.com Must Credit",
"artist": "SoundBible.com Must Credit",
"date": "2017",
}
assert output == expected

def test_video_metadata(self):
    """get_metadata method can fetch video metadata"""
    # Tags expected from get_metadata() for the source that
    # self.get_src() returns when called without arguments.
    source_tags = {
        "compatible_brands": "isomiso2avc1mp41",
        "encoder": "Lavf58.76.100",
        "major_brand": "isom",
        "minor_version": "512",
    }
    reader = StreamReader(self.get_src())
    assert reader.get_metadata() == source_tags

def test_src_info_invalid_index(self):
Expand Down
1 change: 1 addition & 0 deletions torchaudio/csrc/ffmpeg/pybind/pybind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
.def(
"find_best_video_stream",
&StreamReaderFileObj::find_best_video_stream)
.def("get_metadata", &StreamReaderFileObj::get_metadata)
.def(
"get_src_stream_info",
&StreamReaderFileObj::get_src_stream_info_pybind)
Expand Down
7 changes: 7 additions & 0 deletions torchaudio/csrc/ffmpeg/pybind/stream_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,12 @@ StreamReaderFileObj::StreamReaderFileObj(
option.value_or(OptionDict{}),
pAVIO)) {}

// Return the metadata of the source as a std::map.
//
// StreamReader::get_metadata() yields a c10::Dict; every key/value entry of
// that dictionary is copied into a std::map, which is the type this method
// (registered as "get_metadata" in the pybind module) hands back.
std::map<std::string, std::string> StreamReaderFileObj::get_metadata() const {
  std::map<std::string, std::string> ret;
  for (const auto& entry : StreamReader::get_metadata()) {
    // emplace keeps the first value for a key, same as the previous insert.
    ret.emplace(entry.key(), entry.value());
  }
  return ret;
}
} // namespace ffmpeg
} // namespace torchaudio
2 changes: 2 additions & 0 deletions torchaudio/csrc/ffmpeg/pybind/stream_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ class StreamReaderFileObj : protected FileObj, public StreamReaderBinding {
const c10::optional<std::string>& format,
const c10::optional<OptionDict>& option,
int64_t buffer_size);

std::map<std::string, std::string> get_metadata() const;
};

} // namespace ffmpeg
Expand Down
6 changes: 5 additions & 1 deletion torchaudio/csrc/ffmpeg/stream_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ c10::Dict<std::string, std::string> parse_metadata(
}
} // namespace

// Fetch the metadata attached to the source as a whole: the `metadata` field
// of the format context, converted to a string dictionary by parse_metadata.
c10::Dict<std::string, std::string> StreamReader::get_metadata() const {
  auto* source_metadata = pFormatContext->metadata;
  return parse_metadata(source_metadata);
}

SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
validate_src_stream_index(i);
AVStream* stream = pFormatContext->streams[i];
Expand All @@ -93,7 +97,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
ret.bit_rate = codecpar->bit_rate;
ret.num_frames = stream->nb_frames;
ret.bits_per_sample = codecpar->bits_per_raw_sample;
ret.metadata = parse_metadata(pFormatContext->metadata);
ret.metadata = parse_metadata(stream->metadata);
const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id);
if (desc) {
ret.codec_name = desc->name;
Expand Down
2 changes: 2 additions & 0 deletions torchaudio/csrc/ffmpeg/stream_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ class StreamReader {
// Find a suitable audio/video streams using heuristics from ffmpeg
int64_t find_best_audio_stream() const;
int64_t find_best_video_stream() const;
// Fetch metadata of the source
c10::Dict<std::string, std::string> get_metadata() const;
// Fetch information about source streams
int64_t num_src_streams() const;
SrcStreamInfo get_src_stream_info(int i) const;
Expand Down
1 change: 1 addition & 0 deletions torchaudio/csrc/ffmpeg/stream_reader_binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
.def(torch::init<>(init))
.def("num_src_streams", [](S self) { return self->num_src_streams(); })
.def("num_out_streams", [](S self) { return self->num_out_streams(); })
.def("get_metadata", [](S self) { return self->get_metadata(); })
.def(
"get_src_stream_info",
[](S s, int64_t i) { return s->get_src_stream_info(i); })
Expand Down
11 changes: 9 additions & 2 deletions torchaudio/io/_stream_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,7 @@ class StreamReaderSourceStream:
For compressed format, it can be 0.
"""
metadata: Dict[str, str]
"""Metadata attached to the source media.
Note that metadata is common across the source streams."""
"""Metadata attached to the source stream."""


@dataclass
Expand Down Expand Up @@ -397,6 +396,14 @@ def default_video_stream(self):
"""
return self._default_video_stream

def get_metadata(self) -> Dict[str, str]:
    """Get the metadata of the source media.

    Returns:
        dict: Metadata of the source media, as returned by the backend.
    """
    return self._be.get_metadata()

def get_src_stream_info(self, i: int) -> torchaudio.io.StreamReaderSourceStream:
"""Get the metadata of source stream
Expand Down

0 comments on commit 4d2fa19

Please sign in to comment.