Release version 0.2.0

luziferius · Nov 28, 2019 · 9dd1df9 · 9dd1df9
1 parent eae0584
commit 9dd1df9
Show file tree

Hide file tree

Showing 6 changed files with 144 additions and 68 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,3 +1,14 @@
+Version 0.2.0 (17.11.2019)
+
+- Split first pass from second pass for two-pass encoding and use a different task scheduler for two-pass encodes.
+  - Use the first-pass log size as a simple metric to estimate
+    second-pass runtime and schedule the second passes accordingly.
+    Large logs indicate long and/or complex scenes that take long to encode.
+  - Start encoding long running scenes first, which will result in better multicore usage at the end of the processing.
+  - It avoids starting long scenes, like the credits, at the end of the process, and therefore lessens the impact of
+    a single, long encode delaying the whole process. With this scheduling approach, it is much more likely that the
+    last running encodings will be encoding short and easy scenes and will therefore cause less overall delay.
+
 Version 0.1.3 (17.11.2019)
 
 - Fixed broken Two-pass log file moving.

diff --git a/README.rst b/README.rst
@@ -31,6 +31,21 @@ scenes in the scene repository and skips them, avoiding duplicate work.
 When all scenes are encoded, the ffmpeg concat demuxer is used to join all scenes into a single video file.
 
 
+Two-Pass mode: Technical details
+++++++++++++++++++++++++++++++++
+
+Two-Pass mode uses a simple scheduler to ensure high load throughout the encoding process, avoiding single, long running
+encoding processes remaining at the end of the encoding process and artificially delaying the whole process.
+
+This is done by running all first-pass encodes first and then using the first-pass log file size as a simple metric to estimate
+the second-pass runtime and schedule the second passes accordingly.
+The used metric assumes there is a linear correlation between first-pass log file size and second-pass encoding time.
+When the encoding tasks are sorted by the log file size and therefore by the assumed relative run time, the program will
+start encoding long running scenes first. This will result in better multicore usage at the end of the processing.
+It avoids starting long scenes, like the ending credits, at the end of the process, and therefore lessens the impact of
+a single, long encode delaying the whole process. With this scheduling approach, it is much more likely that the
+last running encodings will be encoding short and easy scenes and will therefore cause less overall delay.
+
 Requirements
 ------------
 

diff --git a/TODO b/TODO
@@ -1,16 +1,14 @@
 TODO:
 - Implement maximum scene length limitation.
+- Stackable silent mode:
+  first suppress ffmpeg output except for warnings, then suppress input level log output, then
+  suppress everything except errors, then output nothing.
 
 MAYBE, nice to have ideas:
+- Gracefully finishing running scenes when hitting <Ctrl>+C for the first time, hard aborting on hitting it
+  the second time. The user can use this to abort long processes early without losing any data.
+  Running instances will finish, but no new instances will start in soft exit mode.
 - Use a configuration file to define the default argument values.
-- Use a better two-pass strategy: Perform all first passes first. Then descendingly sort by log file size, then encode the second passes in log file size order.
-  - Log size corrolates with scene length and therefore encoding time.
-  - I hope that the saved time waiting for the last few encoder instances still processing long scenes in the end (movie credits scene encoder starting last!)
-    is longer than the intermediate lost time waiting for all first passes to finish. During the first pass the same happens,
-    but because the first pass is way faster, the effect is way milder.
-  - Running the scenes with large logs first will clear all long scenes in the queue first. At the end of the transcoding process, only short scenes will be left.
-  - The short scenes at the end will encode fast and are getting scheduled to the encoder slots more agile, which should yield better load at the end of the process
-    and less waiting for just a few remaining processes.
 - Use a better merge algorithm.
   - The current greedy algorithm solves the problem well, but does not take the scene cut score into account.
   - For example, maximize the overall scene score value.

diff --git a/av1transcoder/command_line.py b/av1transcoder/command_line.py
@@ -78,8 +78,8 @@ def run(self):
         if self.dump_mode != "only":
             completed = subprocess.run(self.command_line, executable=self.ffmpeg)
             if completed.returncode:
-                warn_msg = f"ffmpeg command exited with non-zero return value indicating failure. " \
-                           f"Failing input file: {self.input_file.input_file}"
+                warn_msg = f'ffmpeg command exited with non-zero return value indicating failure. ' \
+                           f'Failing input file: "{self.input_file.input_file}".'
                 logger.warning(warn_msg)
                 print(warn_msg, file=sys.stderr)
             else:

diff --git a/av1transcoder/constants.py b/av1transcoder/constants.py
@@ -13,7 +13,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-__version__ = "0.1.3"
+__version__ = "0.2.0"
 
 
 PROGRAMNAME = "av1transcoder"

diff --git a/av1transcoder/scene_transcode.py b/av1transcoder/scene_transcode.py
@@ -22,7 +22,9 @@
 import itertools
 from concurrent.futures import ThreadPoolExecutor
 import os
+import pathlib
 import shutil
+import typing
 
 from av1transcoder.argument_parser import Namespace
 from av1transcoder.input_file import InputFile
@@ -41,14 +43,20 @@ def __init__(self, arguments: Namespace, input_file: InputFile, scene: Scene):
         self.scene = scene
 
     def _add_command_line_arguments(self, arguments: Namespace):
-        # Always overwrite for encoding passes.
+        """
+        Adds the common command line arguments.
+        Concrete classes MUST overwrite this and SHOULD call super() first.
+        """
+
+        # Always overwrite for encoding passes (ffmpeg -y option).
         # The two-pass first pass requires -y to write to /dev/null or NUL
         # The single-pass and two-pass second pass write to self.in_progress_temp, which is always safe to write to.
         # Completed files are moved out of it on completion, so if files are present, it indicates that a previous
         # instance aborted. So it is safe to overwrite partial data. (There is a millisecond wide time frame
         # between finishing an encoding and moving the finished file. I’ll ignore that terminating this program during
         # that time frame _might_ cause overwriting and re-doing a single finished scene.)
         self.command_line.append("-y")
+        self._add_common_encoder_options(arguments)
 
     def _add_common_encoder_options(self, arguments: Namespace):
         """
@@ -73,10 +81,29 @@ def _add_common_encoder_options(self, arguments: Namespace):
     def two_pass_log_file_prefix(self) -> str:
         return f"scene_{self.scene.scene_number}"
 
+    @property
+    def output_scene_file_name(self):
+        return f"scene_{self.scene.scene_number}.mkv"
+
     @abstractmethod
     def _get_command_dump_file_name(self) -> str:
         """Returns the file name used to dump the ffmpeg command."""
         pass
+
+    def run(self):
+        super(AbstractEncoderCommandLine, self).run()
+        if self.finished and self.dump_mode != "only":
+            self._move_output_files_to_completed_dir()
+
+    def _move_output_files_to_completed_dir(self):
+        """
+        Move all files produced by ffmpeg into the completed directory. This is executed by the encoder command line
+        run(), after ffmpeg finished.
+        """
+        encoded_scene = self.in_progress_dir / f"scene_{self.scene.scene_number}.mkv"
+        shutil.move(str(encoded_scene), str(self.completed_dir))
+        logger.debug(f'Encoded scene "{encoded_scene}" finished. '
+                     f'Moved to the completed directory "{self.completed_dir}"')
 
 
 class AV1LibAomSinglePassEncoderCommandLine(AbstractEncoderCommandLine):
@@ -89,15 +116,12 @@ def __init__(self, arguments: Namespace, input_file: InputFile, scene: Scene):
         super(AV1LibAomSinglePassEncoderCommandLine, self).__init__(arguments, input_file, scene)
         logger.info(f'Constructing command line to encode scenes in input file "{input_file.input_file}" to AV1.')
         self._add_command_line_arguments(arguments)
-        logger.info(f"Created {self.__class__.__name__} instance.")
+        logger.debug(f"Created {self.__class__.__name__} instance.")
 
     def _add_command_line_arguments(self, arguments: Namespace):
         super(AV1LibAomSinglePassEncoderCommandLine, self)._add_command_line_arguments(arguments)
-        self._add_common_encoder_options(arguments)
-        # Now add the output file
-
-        scene_name = f"scene_{self.scene.scene_number}.mkv"
-        self.command_line.append(str(self.in_progress_dir / scene_name))
+        # The common arguments are sufficient for single-pass encoding, just add the output file path
+        self.command_line.append(str(self.in_progress_dir / self.output_scene_file_name))
 
     def _get_command_dump_file_name(self):
         return "single_pass_encode_commands.txt"
@@ -116,11 +140,10 @@ def __init__(self, arguments: Namespace, input_file: InputFile, scene: Scene):
         logger.info(f'Constructing command line to perform the first pass encode '
                     f'of scene {scene.scene_number} in input file "{input_file.input_file}" to AV1.')
         self._add_command_line_arguments(arguments)
-        logger.info(f"Created {self.__class__.__name__} instance.")
+        logger.debug(f"Created {self.__class__.__name__} instance.")
 
     def _add_command_line_arguments(self, arguments: Namespace):
         super(AV1LibAomTwoPass1EncoderCommandLine, self)._add_command_line_arguments(arguments)
-        self._add_common_encoder_options(arguments)
         # See Two-Pass section of https://trac.ffmpeg.org/wiki/Encode/AV1
         # Specify the muxer and pipe the output to the system null sink.
         # For the log file name, see https://ffmpeg.org/ffmpeg.html#Video-Options
@@ -137,6 +160,20 @@ def _add_command_line_arguments(self, arguments: Namespace):
 
     def _get_command_dump_file_name(self):
         return "two_pass_encode_pass_1_commands.txt"
+
+    def _move_output_files_to_completed_dir(self):
+        # May have produced multiple logs, if the file contains multiple video tracks.
+        logs = self.in_progress_dir.glob(f"{self.two_pass_log_file_prefix}*.log")
+        log_count = 0
+        for log_file in logs:
+            target_file = self.completed_dir/log_file.name
+            if self.force_overwrite and target_file.exists():
+                logger.info(f'Log file already present: "{target_file}". Force overwriting the file.')
+                target_file.unlink()
+            shutil.move(str(log_file), self.completed_dir)
+            log_count += 1
+        logger.debug(f'Moved {log_count} log file{"s" if log_count >= 1 else ""} '
+                     f'for scene {self.scene.scene_number} to the completed directory "{self.completed_dir}".')
 
 
 class AV1LibAomTwoPass2EncoderCommandLine(AbstractEncoderCommandLine):
@@ -150,21 +187,19 @@ def __init__(self, arguments: Namespace, input_file: InputFile, scene: Scene):
         logger.info(f'Constructing command line to perform the second pass encode '
                     f'of scene {scene.scene_number} in input file "{input_file.input_file}" to AV1.')
         self._add_command_line_arguments(arguments)
-        logger.info(f"Created {self.__class__.__name__} instance.")
+        logger.debug(f"Created {self.__class__.__name__} instance.")
 
     def _add_command_line_arguments(self, arguments: Namespace):
         super(AV1LibAomTwoPass2EncoderCommandLine, self)._add_command_line_arguments(arguments)
-        self._add_common_encoder_options(arguments)
 
-        scene_name = f"scene_{self.scene.scene_number}.mkv"
         # See Two-Pass section of https://trac.ffmpeg.org/wiki/Encode/AV1
         # For the log file name, see https://ffmpeg.org/ffmpeg.html#Video-Options
         # Make sure that each scene uses a unique log file name
         self.command_line += [
             "-pass", "2",
             # TODO: Verify that this works with arbitrary paths
             "-passlogfile", str(self.completed_dir/self.two_pass_log_file_prefix),
-            str(self.in_progress_dir / scene_name)
+            str(self.in_progress_dir / self.output_scene_file_name)
         ]
         command_line_str = f"[{', '.join(self.command_line)}]"
         logger.debug(f"Constructed command line. Result: {command_line_str}")
@@ -173,22 +208,38 @@ def _get_command_dump_file_name(self):
         return "two_pass_encode_pass_2_commands.txt"
 
 
+FirstPassList: typing.List[AV1LibAomTwoPass1EncoderCommandLine]
+
+
 def transcode_input_file(arguments: Namespace, input_file: InputFile, scenes: SceneList):
     """Transcode a single input file to AV1."""
-    transcode_function = _transcode_single_pass if arguments.enable_single_pass_encode else _transcode_two_pass
 
     with ThreadPoolExecutor(
             max_workers=arguments.max_concurrent_encodes, thread_name_prefix="ffmpeg_worker") as executor:
-        # Use tuple to drive the map operation
-        tuple(executor.map(transcode_function, itertools.repeat(arguments), itertools.repeat(input_file), scenes))
+        # Use tuple to drive the map operation whenever the result is not required
+        if arguments.enable_single_pass_encode:
+            tuple(executor.map(
+                _transcode_single_pass, itertools.repeat(arguments), itertools.repeat(input_file), scenes
+            ))
+        else:
+            # Only keep finished first passes.
+            first_passes: FirstPassList = list(filter(
+                (lambda pass1: pass1.finished),
+                executor.map(_transcode_two_pass_1, itertools.repeat(arguments), itertools.repeat(input_file), scenes)
+            ))
+            logger.info("All first passes finished. Sorting the second passes in descending order, "
+                        "based on the time consumption heuristic.")
+            _sort_first_passes(first_passes)
+            tuple(executor.map(
+                _transcode_two_pass_2, itertools.repeat(arguments), first_passes
+            ))
 
     concat_filter = ConcatFilterCommandLine(arguments, input_file)
     if concat_filter.handle_directory_creation():
         concat_filter.run()
 
     _cleanup(arguments, input_file)
 
-
 def _transcode_single_pass(arguments: Namespace, input_file: InputFile, scene: Scene):
     logger.info(f'Transcoding "{input_file.input_file}" using Single-Pass encoding…')
     cli = AV1LibAomSinglePassEncoderCommandLine(arguments, input_file, scene)
@@ -199,56 +250,57 @@ def _transcode_single_pass(arguments: Namespace, input_file: InputFile, scene: S
     if cli.handle_directory_creation():
         logger.debug(f'Starting encoding process for file "{input_file.input_file}".')
         cli.run()
-    _move_scene_to_finished_directory(cli)
 
 
-def _transcode_two_pass(arguments: Namespace, input_file: InputFile, scene: Scene):
-    logger.info(f'Transcoding "{input_file.input_file}" using Two-Pass encoding…')
+def _transcode_two_pass_1(
+        arguments: Namespace, input_file: InputFile, scene: Scene) -> AV1LibAomTwoPass1EncoderCommandLine:
+    logger.info(f'Transcoding "{input_file.input_file}" using Two-Pass encoding… Starting first pass.')
     pass1 = AV1LibAomTwoPass1EncoderCommandLine(arguments, input_file, scene)
     # Skip encoding, if the scene is already finished.
-    if (pass1.completed_dir / f"scene_{pass1.scene.scene_number}.mkv").exists():
-        logger.info(f"Scene number {pass1.scene.scene_number} already finished. Skipping.")
-        return
-    if pass1.handle_directory_creation():
-        if arguments.force_overwrite or not \
-                list(pass1.completed_dir.glob(f"{pass1.two_pass_log_file_prefix}*.log")):  # Finished log files exist
-            logger.debug(f'Starting first pass for file "{input_file.input_file}".')
+    final_output_exists = (pass1.completed_dir / pass1.output_scene_file_name).exists()
+    all_logs_present = len(list(pass1.completed_dir.glob(f"{pass1.two_pass_log_file_prefix}*.log"))) == \
+        len(pass1.input_file.video_streams)
+
+    if (final_output_exists or all_logs_present) and not arguments.force_overwrite:
+        logger.info(f"Scene number {pass1.scene.scene_number} already finished. Skipping pass 1.")
+        pass1.finished = True
+        return pass1
+    else:
+        if pass1.handle_directory_creation():
+            logger.debug(
+                f'Starting first pass for file "{input_file.input_file}" and scene {pass1.scene.scene_number}.'
+            )
+            if all_logs_present and arguments.force_overwrite:
+                logger.info(f'Target files for "{input_file.input_file}" and scene {pass1.scene.scene_number} already '
+                            f'present. Overwriting, because --force-overwrite was given.')
             pass1.run()
-            _move_first_pass_log_to_finished_directory(pass1)
-        else:
-            pass1.finished = True
-    pass2 = AV1LibAomTwoPass2EncoderCommandLine(arguments, input_file, scene)
-    if pass2.handle_directory_creation() and pass1.finished:
-        logger.debug(f'Starting second pass for file "{input_file.input_file}".')
-        pass2.run()
-        _move_scene_to_finished_directory(pass2)
+    return pass1
 
 
-def _move_scene_to_finished_directory(cli: AbstractEncoderCommandLine):
-    if cli.finished and cli.dump_mode != "only":
-        encoded_scene = cli.in_progress_dir / f"scene_{cli.scene.scene_number}.mkv"
-        shutil.move(str(encoded_scene), str(cli.completed_dir))
-        logger.debug(f'Encoded scene "{encoded_scene}" finished. '
-                     f'Moved to the completed directory "{cli.completed_dir}"')
-
-
-def _move_first_pass_log_to_finished_directory(cli: AV1LibAomTwoPass1EncoderCommandLine):
-    if cli.finished and cli.dump_mode != "only":
-        # May have produced multiple logs, if the file contains multiple video tracks.
-        logs = cli.in_progress_dir.glob(f"{cli.two_pass_log_file_prefix}*.log")
-        log_count = 0
-        for log_file in logs:
-            target_file = cli.completed_dir/log_file.name
-            if cli.force_overwrite and target_file.exists():
-                logger.info(f'Log file already present: "{target_file}". Force overwriting the file.')
-                target_file.unlink()
-            shutil.move(str(log_file), cli.completed_dir)
-            log_count += 1
-        logger.debug(f'Moved {log_count} log file{"s" if log_count >= 1 else ""} '
-                     f'for scene {cli.scene.scene_number} to the completed directory "{cli.completed_dir}".')
+def _transcode_two_pass_2(arguments: Namespace, pass1: AV1LibAomTwoPass1EncoderCommandLine):
+    logger.info(f'Transcoding "{pass1.input_file.input_file}" using Two-Pass encoding… Starting second pass.')
+    pass2 = AV1LibAomTwoPass2EncoderCommandLine(arguments, pass1.input_file, pass1.scene)
+    final_output_exists = (pass2.completed_dir / pass2.output_scene_file_name).exists()
+    if final_output_exists and not arguments.force_overwrite:
+        logger.info(f"Scene number {pass1.scene.scene_number} already finished. Skipping pass 2.")
+        return
+    if pass2.handle_directory_creation():
+        logger.debug(f'Starting second pass for file "{pass1.input_file.input_file}".')
+        pass2.run()
 
 
 def _cleanup(arguments: Namespace, input_file: InputFile):
     if not arguments.keep_temp:
         logger.info(f'Removing temporary files: "{input_file.temp_dir}"')
         input_file.temp_dir.unlink()
+
+
+def _sort_first_passes(passes) -> None:
+
+    def key(pass1: AV1LibAomTwoPass1EncoderCommandLine):
+        all_logs = pass1.completed_dir.glob(f"{pass1.two_pass_log_file_prefix}*.log")
+        first_log: pathlib.Path = sorted(all_logs)[0]
+        size_bytes = first_log.stat().st_size
+        return size_bytes
+
+    passes.sort(key=key, reverse=True)