Make subtitle timestamp fix an opt-out with --no-timestamp-fix

devine-dl · May 6, 2024 · 5cc8017 · 5cc8017
1 parent 9869ba7
commit 5cc8017
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 40 deletions.
diff --git a/devine/commands/dl.py b/devine/commands/dl.py
@@ -105,6 +105,8 @@ class dl:
     @click.option("--sub-format", type=click.Choice(Subtitle.Codec, case_sensitive=False),
                   default=None,
                   help="Set Output Subtitle Format, only converting if necessary.")
+    @click.option("--no-timestamp-fix", "fix_sub_timestamp", is_flag=True, default=True,
+                  help="Disable subtitle timestamp fix.")
     @click.option("-V", "--video-only", is_flag=True, default=False,
                   help="Only download video tracks.")
     @click.option("-A", "--audio-only", is_flag=True, default=False,
@@ -266,6 +268,7 @@ def result(
         v_lang: list[str],
         s_lang: list[str],
         sub_format: Optional[Subtitle.Codec],
+        fix_sub_timestamp: bool,
         video_only: bool,
         audio_only: bool,
         subs_only: bool,

diff --git a/devine/core/tracks/subtitle.py b/devine/core/tracks/subtitle.py
@@ -264,8 +264,22 @@ def convert(self, codec: Subtitle.Codec) -> Path:
             if writer is None:
                 raise NotImplementedError(f"Cannot yet convert {self.codec.name} to {codec.name}.")
 
-            caption_set = self.parse(self.path.read_bytes(), self.codec)
+            if self.descriptor == Subtitle.Descriptor.DASH:
+                # TODO: Populated in DASH.download_track. Perhaps DASH/HLS class should
+                #       use a dict instead of a tuple?
+                # TODO PR#67 rlaphoenix: This will be moved/done within self.parse instead
+                extra = {
+                    "_timescale": self.data["dash"]["_timescale"],
+                    "_segment_duration": self.data["dash"]["_segment_duration"],
+                }
+            else:
+                extra = None
+
+            # TODO PR#67 rlaphoenix: The hard-coded True is the "fix WebVTT timestamps" flag; it is meant to be the bool from the dl CLI args.
+            #                        Subtitle.convert() is not passed that bool yet, but this is temporary and will be removed anyway.
+            caption_set = self.parse(self.path.read_bytes(), self.codec, True, extra)
             Subtitle.merge_same_cues(caption_set)
+
             subtitle_text = writer().write(caption_set)
 
             output_path.write_text(subtitle_text, encoding="utf8")
@@ -279,7 +293,9 @@ def convert(self, codec: Subtitle.Codec) -> Path:
         return output_path
 
     @staticmethod
-    def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet:
+    def parse(data: bytes, codec: Subtitle.Codec, fix_sub_timestamp: bool = False, extra: Optional[dict] = None) -> pycaption.CaptionSet:
+        extra = extra or {}
+        # TODO: Use an "enum" for subtitle codecs
         if not isinstance(data, bytes):
             raise ValueError(f"Subtitle data must be parsed as bytes data, not {type(data).__name__}")
 
@@ -310,7 +326,13 @@ def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet:
                 caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
             elif codec == Subtitle.Codec.WebVTT:
                 text = Subtitle.space_webvtt_headers(data)
-                caption_set = pycaption.WebVTTReader().read(text)
+                caption_set: pycaption.CaptionSet
+                if fix_sub_timestamp:
+                    duration = extra.get("_segment_duration")
+                    timescale = extra.get("_timescale", 1)
+                    caption_set = fix_webvtt_timestamp(text, segment_duration=duration, timescale=timescale)
+                else:
+                    caption_set = pycaption.WebVTTReader().read(text)
             else:
                 raise ValueError(f"Unknown Subtitle format \"{codec}\"...")
         except pycaption.exceptions.CaptionReadSyntaxError as e:
@@ -575,39 +597,5 @@ def reverse_rtl(self) -> None:
             stdout=subprocess.DEVNULL
         )
 
-    def fix_webvtt_timestamp(self) -> None:
-        # TODO PR#67 rlaphoenix: This func name clashes with the import from newly added utils.webvtt
-        """
-        Convert segmented WebVTT timestamps where each cue starts at 0 (relative to the segment)
-        to absolute timestamps.
-
-        This function is not called by default; instead, service code should explicitly call
-        this function when needed. Example using a callback::
-
-            if isinstance(track, Subtitle):
-                track.OnDownloaded = lambda track: track.fix_webvtt_timestamp()
-
-        """
-        if not self.path or not self.path.exists():
-            raise ValueError("You must download the subtitle track first.")
-
-        if self.codec is not Subtitle.Codec.WebVTT:
-            raise ValueError(f"Expected subtitle codec to be a {Subtitle.Codec.WebVTT}, not {self.codec}.")
-
-        if self.descriptor is Subtitle.Descriptor.MPD:
-            segment_duration = self.data["dash"]["_segment_duration"]
-            timescale = self.data["dash"]["_timescale"]
-        elif self.descriptor is Subtitle.Descriptor.M3U:
-            segment_duration = None
-            timescale = 1
-        else:
-            return
-
-        text = Subtitle.space_webvtt_headers(self.path.read_text("utf8"))
-        fixed = fix_webvtt_timestamp(
-            text, segment_duration=segment_duration, timescale=timescale
-        )
-
-        self.path.write_text(fixed, "utf8")
 
 __all__ = ("Subtitle",)
diff --git a/devine/core/utils/webvtt.py b/devine/core/utils/webvtt.py
@@ -3,7 +3,7 @@
 import typing
 from typing import Optional
 
-from pycaption import Caption, CaptionList, CaptionNode, CaptionReadError, WebVTTReader, WebVTTWriter
+from pycaption import Caption, CaptionList, CaptionNode, CaptionReadError, CaptionSet, WebVTTReader
 
 
 class CaptionListExt(CaptionList):
@@ -121,7 +121,7 @@ def _parse_local(string: str) -> float:
         return (milliseconds / 1000) + seconds + (minutes * 60) + (hours * 3600)
 
 
-def fix_webvtt_timestamp(vtt_raw: str, segment_duration: Optional[list[int]] = None, timescale: int = 1) -> str:
+def fix_webvtt_timestamp(vtt_raw: str, segment_duration: Optional[list[int]] = None, timescale: int = 1) -> CaptionSet:
     """
     Fix relative timestamp from segmented WebVTT to absolute timestamp.
 
@@ -175,4 +175,4 @@ def fix_webvtt_timestamp(vtt_raw: str, segment_duration: Optional[list[int]] = N
         # Remove duplicate
         captions[:] = [c for c_index, c in enumerate(captions) if c_index not in set(duplicate_index)]
 
-    return WebVTTWriter().write(vtt)
+    return vtt