pytorch · fmassa · Mar 17, 2020 · Jan 14, 2020 · Feb 3, 2020 · Feb 9, 2020
diff --git a/setup.py b/setup.py
@@ -152,17 +152,6 @@ def get_extensions():
 
     include_dirs = [extensions_dir]
 
-    ffmpeg_exe = distutils.spawn.find_executable('ffmpeg')
-    has_ffmpeg = ffmpeg_exe is not None
-    if has_ffmpeg:
-        ffmpeg_bin = os.path.dirname(ffmpeg_exe)
-        ffmpeg_root = os.path.dirname(ffmpeg_bin)
-        ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')
-
-        # TorchVision video reader
-        video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
-        video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp"))
-
     ext_modules = [
         extension(
             'torchvision._C',
@@ -182,12 +171,30 @@ def get_extensions():
                 extra_compile_args=extra_compile_args,
             )
         )
+
+    ffmpeg_exe = distutils.spawn.find_executable('ffmpeg')
+    has_ffmpeg = ffmpeg_exe is not None
+
     if has_ffmpeg:
+        ffmpeg_bin = os.path.dirname(ffmpeg_exe)
+        ffmpeg_root = os.path.dirname(ffmpeg_bin)
+        ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')
+
+        # TorchVision base decoder + video reader
+        video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
+        video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp"))
+        base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
+        base_decoder_src = glob.glob(
+            os.path.join(base_decoder_src_dir, "[!sync_decoder_test,!utils_test]*.cpp"))
+
+        combined_src = video_reader_src + base_decoder_src
+
         ext_modules.append(
             CppExtension(
                 'torchvision.video_reader',
-                video_reader_src,
+                combined_src,
                 include_dirs=[
+                    base_decoder_src_dir,
                     video_reader_src_dir,
                     ffmpeg_include_dir,
                     extensions_dir,

diff --git a/test/test_video_reader.py b/test/test_video_reader.py
@@ -395,7 +395,7 @@ def compare_decoding_result(self, tv_result, ref_result, config=all_check_config
     def test_stress_test_read_video_from_file(self):
         num_iter = 10000
         # video related
-        width, height, min_dimension = 0, 0, 0
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -416,6 +416,7 @@ def test_stress_test_read_video_from_file(self):
                     width,
                     height,
                     min_dimension,
+                    max_dimension,
                     video_start_pts,
                     video_end_pts,
                     video_timebase_num,
@@ -434,7 +435,7 @@ def test_read_video_from_file(self):
         Test the case when decoder starts with a video file to decode frames.
         """
         # video related
-        width, height, min_dimension = 0, 0, 0
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -454,6 +455,7 @@ def test_read_video_from_file(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -479,7 +481,7 @@ def test_read_video_from_file_read_single_stream_only(self):
         only reads video stream and ignores audio stream
         """
         # video related
-        width, height, min_dimension = 0, 0, 0
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -499,6 +501,7 @@ def test_read_video_from_file_read_single_stream_only(self):
                     width,
                     height,
                     min_dimension,
+                    max_dimension,
                     video_start_pts,
                     video_end_pts,
                     video_timebase_num,
@@ -536,7 +539,7 @@ def test_read_video_from_file_rescale_min_dimension(self):
         video min dimension between height and width is set.
         """
         # video related
-        width, height, min_dimension = 0, 0, 128
+        width, height, min_dimension, max_dimension = 0, 0, 128, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -555,6 +558,7 @@ def test_read_video_from_file_rescale_min_dimension(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -571,13 +575,100 @@ def test_read_video_from_file_rescale_min_dimension(self):
                 min_dimension, min(tv_result[0].size(1), tv_result[0].size(2))
             )
 
+    def test_read_video_from_file_rescale_max_dimension(self):
+        """
+        Test the case when decoder starts with a video file to decode frames, and
+        video min dimension between height and width is set.
+        """
+        # video related
+        width, height, min_dimension, max_dimension = 0, 0, 0, 85
+        video_start_pts, video_end_pts = 0, -1
+        video_timebase_num, video_timebase_den = 0, 1
+        # audio related
+        samples, channels = 0, 0
+        audio_start_pts, audio_end_pts = 0, -1
+        audio_timebase_num, audio_timebase_den = 0, 1
+
+        for test_video, _config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            tv_result = torch.ops.video_reader.read_video_from_file(
+                full_path,
+                seek_frame_margin,
+                0,  # getPtsOnly
+                1,  # readVideoStream
+                width,
+                height,
+                min_dimension,
+                max_dimension,
+                video_start_pts,
+                video_end_pts,
+                video_timebase_num,
+                video_timebase_den,
+                1,  # readAudioStream
+                samples,
+                channels,
+                audio_start_pts,
+                audio_end_pts,
+                audio_timebase_num,
+                audio_timebase_den,
+            )
+            self.assertEqual(
+                max_dimension, max(tv_result[0].size(1), tv_result[0].size(2))
+            )
+
+    def test_read_video_from_file_rescale_both_min_max_dimension(self):
+        """
+        Test the case when decoder starts with a video file to decode frames, and
+        video min dimension between height and width is set.
+        """
+        # video related
+        width, height, min_dimension, max_dimension = 0, 0, 64, 85
+        video_start_pts, video_end_pts = 0, -1
+        video_timebase_num, video_timebase_den = 0, 1
+        # audio related
+        samples, channels = 0, 0
+        audio_start_pts, audio_end_pts = 0, -1
+        audio_timebase_num, audio_timebase_den = 0, 1
+
+        for test_video, _config in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+
+            tv_result = torch.ops.video_reader.read_video_from_file(
+                full_path,
+                seek_frame_margin,
+                0,  # getPtsOnly
+                1,  # readVideoStream
+                width,
+                height,
+                min_dimension,
+                max_dimension,
+                video_start_pts,
+                video_end_pts,
+                video_timebase_num,
+                video_timebase_den,
+                1,  # readAudioStream
+                samples,
+                channels,
+                audio_start_pts,
+                audio_end_pts,
+                audio_timebase_num,
+                audio_timebase_den,
+            )
+            self.assertEqual(
+                min_dimension, min(tv_result[0].size(1), tv_result[0].size(2))
+            )
+            self.assertEqual(
+                max_dimension, max(tv_result[0].size(1), tv_result[0].size(2))
+            )
+
     def test_read_video_from_file_rescale_width(self):
         """
         Test the case when decoder starts with a video file to decode frames, and
         video width is set.
         """
         # video related
-        width, height, min_dimension = 256, 0, 0
+        width, height, min_dimension, max_dimension = 256, 0, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -596,6 +687,7 @@ def test_read_video_from_file_rescale_width(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -616,7 +708,7 @@ def test_read_video_from_file_rescale_height(self):
         video height is set.
         """
         # video related
-        width, height, min_dimension = 0, 224, 0
+        width, height, min_dimension, max_dimension = 0, 224, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -635,6 +727,7 @@ def test_read_video_from_file_rescale_height(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -655,7 +748,7 @@ def test_read_video_from_file_rescale_width_and_height(self):
         both video height and width are set.
         """
         # video related
-        width, height, min_dimension = 320, 240, 0
+        width, height, min_dimension, max_dimension = 320, 240, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -674,6 +767,7 @@ def test_read_video_from_file_rescale_width_and_height(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -697,7 +791,7 @@ def test_read_video_from_file_audio_resampling(self):
 
         for samples in [9600, 96000]:  # downsampling  # upsampling
             # video related
-            width, height, min_dimension = 0, 0, 0
+            width, height, min_dimension, max_dimension = 0, 0, 0, 0
             video_start_pts, video_end_pts = 0, -1
             video_timebase_num, video_timebase_den = 0, 1
             # audio related
@@ -716,6 +810,7 @@ def test_read_video_from_file_audio_resampling(self):
                     width,
                     height,
                     min_dimension,
+                    max_dimension,
                     video_start_pts,
                     video_end_pts,
                     video_timebase_num,
@@ -752,7 +847,7 @@ def test_compare_read_video_from_memory_and_file(self):
         Test the case when video is already in memory, and decoder reads data in memory
         """
         # video related
-        width, height, min_dimension = 0, 0, 0
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -772,6 +867,7 @@ def test_compare_read_video_from_memory_and_file(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -794,6 +890,7 @@ def test_compare_read_video_from_memory_and_file(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -816,7 +913,7 @@ def test_read_video_from_memory(self):
         Test the case when video is already in memory, and decoder reads data in memory
         """
         # video related
-        width, height, min_dimension = 0, 0, 0
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -836,6 +933,7 @@ def test_read_video_from_memory(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -861,7 +959,7 @@ def test_read_video_from_memory_get_pts_only(self):
         for both pts and frame data
         """
         # video related
-        width, height, min_dimension = 0, 0, 0
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -881,6 +979,7 @@ def test_read_video_from_memory_get_pts_only(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -904,6 +1003,7 @@ def test_read_video_from_memory_get_pts_only(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -930,7 +1030,7 @@ def test_read_video_in_range_from_memory(self):
         for test_video, config in test_videos.items():
             full_path, video_tensor = _get_video_tensor(VIDEO_DIR, test_video)
             # video related
-            width, height, min_dimension = 0, 0, 0
+            width, height, min_dimension, max_dimension = 0, 0, 0, 0
             video_start_pts, video_end_pts = 0, -1
             video_timebase_num, video_timebase_den = 0, 1
             # audio related
@@ -946,6 +1046,7 @@ def test_read_video_in_range_from_memory(self):
                 width,
                 height,
                 min_dimension,
+                max_dimension,
                 video_start_pts,
                 video_end_pts,
                 video_timebase_num,
@@ -1000,6 +1101,7 @@ def test_read_video_in_range_from_memory(self):
                     width,
                     height,
                     min_dimension,
+                    max_dimension,
                     video_start_pts,
                     video_end_pts,
                     video_timebase_num,
@@ -1099,7 +1201,7 @@ def test_read_video_from_memory_scripted(self):
         Test the case when video is already in memory, and decoder reads data in memory
         """
         # video related
-        width, height, min_dimension = 0, 0, 0
+        width, height, min_dimension, max_dimension = 0, 0, 0, 0
         video_start_pts, video_end_pts = 0, -1
         video_timebase_num, video_timebase_den = 0, 1
         # audio related
@@ -1130,6 +1232,7 @@ def test_read_video_from_memory_scripted(self):
                 [audio_start_pts, audio_end_pts],
                 audio_timebase_num,
                 audio_timebase_den,
+                max_dimension,
             )
             # FUTURE: check value of video / audio frames