Skip to content

Commit

Permalink
Make subtitle timestamp fix an opt-out with --no-timestamp-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
Shivelight authored and rlaphoenix committed May 6, 2024
1 parent 04b88d7 commit 0e5af93
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 40 deletions.
3 changes: 3 additions & 0 deletions devine/commands/dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ class dl:
@click.option("--sub-format", type=click.Choice(Subtitle.Codec, case_sensitive=False),
default=None,
help="Set Output Subtitle Format, only converting if necessary.")
@click.option("--no-timestamp-fix", "fix_sub_timestamp", is_flag=True, default=True,
help="Disable subtitle timestamp fix.")
@click.option("-V", "--video-only", is_flag=True, default=False,
help="Only download video tracks.")
@click.option("-A", "--audio-only", is_flag=True, default=False,
Expand Down Expand Up @@ -266,6 +268,7 @@ def result(
v_lang: list[str],
s_lang: list[str],
sub_format: Optional[Subtitle.Codec],
fix_sub_timestamp: bool,
video_only: bool,
audio_only: bool,
subs_only: bool,
Expand Down
62 changes: 25 additions & 37 deletions devine/core/tracks/subtitle.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,22 @@ def convert(self, codec: Subtitle.Codec) -> Path:
if writer is None:
raise NotImplementedError(f"Cannot yet convert {self.codec.name} to {codec.name}.")

caption_set = self.parse(self.path.read_bytes(), self.codec)
if self.descriptor == Subtitle.Descriptor.DASH:
# TODO: Populated in DASH.download_track. Perhaps DASH/HLS class should
# use a dict instead of a tuple?
# TODO PR#67 rlaphoenix: This will be moved/done within self.parse instead
extra = {
"_timescale": self.data["dash"]["_timescale"],
"_segment_duration": self.data["dash"]["_segment_duration"],
}
else:
extra = None

# TODO PR#67 rlaphoenix: The literal True passed below is the fix_sub_timestamp flag (whether to fix WebVTT timestamps); it should come from the dl CLI args.
# Subtitle.convert() is not passed that flag, so it is hard-coded here for now; this is temporary and will be removed anyway.
caption_set = self.parse(self.path.read_bytes(), self.codec, True, extra)
Subtitle.merge_same_cues(caption_set)

subtitle_text = writer().write(caption_set)

output_path.write_text(subtitle_text, encoding="utf8")
Expand All @@ -279,7 +293,9 @@ def convert(self, codec: Subtitle.Codec) -> Path:
return output_path

@staticmethod
def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet:
def parse(data: bytes, codec: Subtitle.Codec, fix_sub_timestamp: bool = False, extra: Optional[dict] = None) -> pycaption.CaptionSet:
extra = extra or {}
# TODO: Use an "enum" for subtitle codecs
if not isinstance(data, bytes):
raise ValueError(f"Subtitle data must be parsed as bytes data, not {type(data).__name__}")

Expand Down Expand Up @@ -310,7 +326,13 @@ def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet:
caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
elif codec == Subtitle.Codec.WebVTT:
text = Subtitle.space_webvtt_headers(data)
caption_set = pycaption.WebVTTReader().read(text)
caption_set: pycaption.CaptionSet
if fix_sub_timestamp:
duration = extra.get("_segment_duration")
timescale = extra.get("_timescale", 1)
caption_set = fix_webvtt_timestamp(text, segment_duration=duration, timescale=timescale)
else:
caption_set = pycaption.WebVTTReader().read(text)
else:
raise ValueError(f"Unknown Subtitle format \"{codec}\"...")
except pycaption.exceptions.CaptionReadSyntaxError as e:
Expand Down Expand Up @@ -575,39 +597,5 @@ def reverse_rtl(self) -> None:
stdout=subprocess.DEVNULL
)

def fix_webvtt_timestamp(self) -> None:
# TODO PR#67 rlaphoenix: This func name clashes with the import from newly added utils.webvtt
"""
Convert segmented WebVTT timestamps where each cue starts at 0 (relative to the segment)
to absolute timestamps.
This function is not called by default; instead, service code should explicitly call
this function when needed. Example using a callback::
if isinstance(track, Subtitle):
track.OnDownloaded = lambda track: track.fix_webvtt_timestamp()
"""
if not self.path or not self.path.exists():
raise ValueError("You must download the subtitle track first.")

if self.codec is not Subtitle.Codec.WebVTT:
raise ValueError(f"Expected subtitle codec to be a {Subtitle.Codec.WebVTT}, not {self.codec}.")

if self.descriptor is Subtitle.Descriptor.MPD:
segment_duration = self.data["dash"]["_segment_duration"]
timescale = self.data["dash"]["_timescale"]
elif self.descriptor is Subtitle.Descriptor.M3U:
segment_duration = None
timescale = 1
else:
return

text = Subtitle.space_webvtt_headers(self.path.read_text("utf8"))
fixed = fix_webvtt_timestamp(
text, segment_duration=segment_duration, timescale=timescale
)

self.path.write_text(fixed, "utf8")

__all__ = ("Subtitle",)
6 changes: 3 additions & 3 deletions devine/core/utils/webvtt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import typing
from typing import Optional

from pycaption import Caption, CaptionList, CaptionNode, CaptionReadError, WebVTTReader, WebVTTWriter
from pycaption import Caption, CaptionList, CaptionNode, CaptionReadError, CaptionSet, WebVTTReader


class CaptionListExt(CaptionList):
Expand Down Expand Up @@ -121,7 +121,7 @@ def _parse_local(string: str) -> float:
return (milliseconds / 1000) + seconds + (minutes * 60) + (hours * 3600)


def fix_webvtt_timestamp(vtt_raw: str, segment_duration: Optional[list[int]] = None, timescale: int = 1) -> str:
def fix_webvtt_timestamp(vtt_raw: str, segment_duration: Optional[list[int]] = None, timescale: int = 1) -> CaptionSet:
"""
Fix relative timestamp from segmented WebVTT to absolute timestamp.
Expand Down Expand Up @@ -175,4 +175,4 @@ def fix_webvtt_timestamp(vtt_raw: str, segment_duration: Optional[list[int]] = N
# Remove duplicate
captions[:] = [c for c_index, c in enumerate(captions) if c_index not in set(duplicate_index)]

return WebVTTWriter().write(vtt)
return vtt

0 comments on commit 0e5af93

Please sign in to comment.