Update source info (pytorch#2418)

Summary: Add `num_frames` and `bits_per_sample` to the source stream info to match the current `torchaudio.info` capability.
Pull Request resolved: pytorch#2418
Reviewed By: carolineechen
Differential Revision: D36749077
Pulled By: mthrok
fbshipit-source-id: 7b368ee993cf5ed63ff2f53c9e3b1f50fcce7713
mthrok · May 29, 2022 · bb77cbe · bb77cbe
1 parent fd7ace1
commit bb77cbe
Show file tree

Hide file tree

Showing 6 changed files with 65 additions and 21 deletions.
diff --git a/test/torchaudio_unittest/io/stream_reader_test.py b/test/torchaudio_unittest/io/stream_reader_test.py
@@ -96,6 +96,8 @@ def test_src_info(self):
                 codec_long_name="H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10",
                 format="yuv420p",
                 bit_rate=71925,
+                num_frames=325,
+                bits_per_sample=8,
                 width=320,
                 height=180,
                 frame_rate=25.0,
@@ -106,6 +108,8 @@ def test_src_info(self):
                 codec_long_name="AAC (Advanced Audio Coding)",
                 format="fltp",
                 bit_rate=72093,
+                num_frames=103,
+                bits_per_sample=0,
                 sample_rate=8000.0,
                 num_channels=2,
             ),
@@ -115,13 +119,17 @@ def test_src_info(self):
                 codec_long_name="MOV text",
                 format=None,
                 bit_rate=None,
+                num_frames=None,
+                bits_per_sample=None,
             ),
             StreamReaderSourceVideoStream(
                 media_type="video",
                 codec="h264",
                 codec_long_name="H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10",
                 format="yuv420p",
                 bit_rate=128783,
+                num_frames=390,
+                bits_per_sample=8,
                 width=480,
                 height=270,
                 frame_rate=29.97002997002997,
@@ -132,6 +140,8 @@ def test_src_info(self):
                 codec_long_name="AAC (Advanced Audio Coding)",
                 format="fltp",
                 bit_rate=128837,
+                num_frames=205,
+                bits_per_sample=0,
                 sample_rate=16000.0,
                 num_channels=2,
             ),
@@ -141,6 +151,8 @@ def test_src_info(self):
                 codec_long_name="MOV text",
                 format=None,
                 bit_rate=None,
+                num_frames=None,
+                bits_per_sample=None,
             ),
         ]
         output = [s.get_src_stream_info(i) for i in range(6)]

diff --git a/torchaudio/csrc/ffmpeg/stream_reader.cpp b/torchaudio/csrc/ffmpeg/stream_reader.cpp
@@ -79,6 +79,8 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
   SrcStreamInfo ret;
   ret.media_type = codecpar->codec_type;
   ret.bit_rate = codecpar->bit_rate;
+  ret.num_frames = stream->nb_frames;
+  ret.bits_per_sample = codecpar->bits_per_raw_sample;
   const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id);
   if (desc) {
     ret.codec_name = desc->name;

diff --git a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp
@@ -11,6 +11,8 @@ SrcInfo convert(SrcStreamInfo ssi) {
       ssi.codec_long_name,
       ssi.fmt_name,
       ssi.bit_rate,
+      ssi.num_frames,
+      ssi.bits_per_sample,
       ssi.sample_rate,
       ssi.num_channels,
       ssi.width,

diff --git a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h
@@ -11,6 +11,8 @@ using SrcInfo = std::tuple<
     std::string, // codec long name
     std::string, // format name
     int64_t, // bit_rate
+    int64_t, // num_frames
+    int64_t, // bits_per_sample
     // Audio
     double, // sample_rate
     int64_t, // num_channels

diff --git a/torchaudio/csrc/ffmpeg/typedefs.h b/torchaudio/csrc/ffmpeg/typedefs.h
@@ -12,6 +12,8 @@ struct SrcStreamInfo {
   const char* codec_long_name = "N/A";
   const char* fmt_name = "N/A";
   int64_t bit_rate = 0;
+  int64_t num_frames = 0;
+  int bits_per_sample = 0;
   // Audio
   double sample_rate = 0;
   int num_channels = 0;

diff --git a/torchaudio/io/_stream_reader.py b/torchaudio/io/_stream_reader.py
@@ -55,6 +55,12 @@ class StreamReaderSourceStream:
     This is an estimated values based on the initial few frames of the stream.
     For container formats and variable bit rate, it can be 0.
     """
+    num_frames: Optional[int]
+    """The number of frames in the stream"""
+    bits_per_sample: Optional[int]
+    """This is the number of valid bits in each output sample.
+    For compressed format, it can be 0.
+    """
 
 
 @dataclass
@@ -100,41 +106,59 @@ class StreamReaderSourceVideoStream(StreamReaderSourceStream):
 _CODEC_LONG = 2
 _FORMAT = 3
 _BIT_RATE = 4
+_NUM_FRAMES = 5
+_BPS = 6
 # - AUDIO
-_SAMPLE_RATE = 5
-_NUM_CHANNELS = 6
+_SAMPLE_RATE = 7
+_NUM_CHANNELS = 8
 # - VIDEO
-_WIDTH = 7
-_HEIGHT = 8
-_FRAME_RATE = 9
+_WIDTH = 9
+_HEIGHT = 10
+_FRAME_RATE = 11
 
 
 def _parse_si(i):
     media_type = i[_MEDIA_TYPE]
     codec_name = i[_CODEC]
     codec_long_name = i[_CODEC_LONG]
+    fmt = i[_FORMAT]
+    bit_rate = i[_BIT_RATE]
+    num_frames = i[_NUM_FRAMES]
+    bps = i[_BPS]
     if media_type == "audio":
         return StreamReaderSourceAudioStream(
-            media_type,
-            codec_name,
-            codec_long_name,
-            i[_FORMAT],
-            i[_BIT_RATE],
-            i[_SAMPLE_RATE],
-            i[_NUM_CHANNELS],
+            media_type=media_type,
+            codec=codec_name,
+            codec_long_name=codec_long_name,
+            format=fmt,
+            bit_rate=bit_rate,
+            num_frames=num_frames,
+            bits_per_sample=bps,
+            sample_rate=i[_SAMPLE_RATE],
+            num_channels=i[_NUM_CHANNELS],
         )
     if media_type == "video":
         return StreamReaderSourceVideoStream(
-            media_type,
-            codec_name,
-            codec_long_name,
-            i[_FORMAT],
-            i[_BIT_RATE],
-            i[_WIDTH],
-            i[_HEIGHT],
-            i[_FRAME_RATE],
+            media_type=media_type,
+            codec=codec_name,
+            codec_long_name=codec_long_name,
+            format=fmt,
+            bit_rate=bit_rate,
+            num_frames=num_frames,
+            bits_per_sample=bps,
+            width=i[_WIDTH],
+            height=i[_HEIGHT],
+            frame_rate=i[_FRAME_RATE],
         )
-    return StreamReaderSourceStream(media_type, codec_name, codec_long_name, None, None)
+    return StreamReaderSourceStream(
+        media_type=media_type,
+        codec=codec_name,
+        codec_long_name=codec_long_name,
+        format=None,
+        bit_rate=None,
+        num_frames=None,
+        bits_per_sample=None,
+    )
 
 
 @dataclass