Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite transcript logic to be more generic #4747

Merged
merged 3 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/invidious/routes/api/v1/videos.cr
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,14 @@ module Invidious::Routes::API::V1::Videos

if CONFIG.use_innertube_for_captions
params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
initial_data = YoutubeAPI.get_transcript(params)

webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code)
transcript = Invidious::Videos::Transcript.from_raw(
YoutubeAPI.get_transcript(params),
caption.language_code,
caption.auto_generated
)

webvtt = transcript.to_vtt
else
# Timedtext API handling
url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
Expand Down
111 changes: 80 additions & 31 deletions src/invidious/videos/transcript.cr
Original file line number Diff line number Diff line change
@@ -1,8 +1,26 @@
module Invidious::Videos
# Namespace for methods primarily relating to Transcripts
module Transcript
record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String
# A `Transcript` struct encapsulates a sequence of lines that together form the whole transcript for a given YouTube video.
# These lines can be categorized into two types: section headings and regular lines representing content from the video.
struct Transcript
# Types
record HeadingLine, start_ms : Time::Span, end_ms : Time::Span, line : String
record RegularLine, start_ms : Time::Span, end_ms : Time::Span, line : String
alias TranscriptLine = HeadingLine | RegularLine

property lines : Array(TranscriptLine)

property language_code : String
property auto_generated : Bool

# User friendly label for the current transcript.
# Example: "English (auto-generated)"
property label : String

# Initializes a new Transcript struct with the contents and associated metadata describing it
def initialize(@lines : Array(TranscriptLine), @language_code : String, @auto_generated : Bool, @label : String)
end

# Generates a protobuf string to fetch the requested transcript from YouTube
def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
kind = auto_generated ? "asr" : ""

Expand Down Expand Up @@ -30,48 +48,79 @@ module Invidious::Videos
return params
end

def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String
# Convert into array of TranscriptLine
lines = self.parse(initial_data)
# Constructs a `Transcript` struct from the initial YouTube response
def self.from_raw(initial_data : Hash(String, JSON::Any), language_code : String, auto_generated : Bool)
transcript_panel = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
"content", "transcriptSearchPanelRenderer")

settings_field = {
"Kind" => "captions",
"Language" => target_language,
}
segment_list = transcript_panel.dig("body", "transcriptSegmentListRenderer")

# Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt()
vtt = WebVTT.build(settings_field) do |vtt|
lines.each do |line|
vtt.cue(line.start_ms, line.end_ms, line.line)
end
if !segment_list["initialSegments"]?
raise NotFoundException.new("Requested transcript does not exist")
end

return vtt
end
# Extract user-friendly label for the current transcript

footer_language_menu = transcript_panel.dig?(
"footer", "transcriptFooterRenderer", "languageMenu", "sortFilterSubMenuRenderer", "subMenuItems"
)

private def self.parse(initial_data : Hash(String, JSON::Any))
body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
"content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
"initialSegments").as_a
if footer_language_menu
label = footer_language_menu.as_a.select(&.["selected"].as_bool)[0]["title"].as_s
else
label = language_code
end

# Extract transcript lines

initial_segments = segment_list["initialSegments"].as_a

lines = [] of TranscriptLine
body.each do |line|
# Transcript section headers. They are not a part of the captions and as such we can safely skip them.
if line.as_h.has_key?("transcriptSectionHeaderRenderer")
next

initial_segments.each do |line|
if unpacked_line = line["transcriptSectionHeaderRenderer"]?
line_type = HeadingLine
else
unpacked_line = line["transcriptSegmentRenderer"]
line_type = RegularLine
end

line = line["transcriptSegmentRenderer"]
start_ms = unpacked_line["startMs"].as_s.to_i.millisecond
end_ms = unpacked_line["endMs"].as_s.to_i.millisecond
text = extract_text(unpacked_line["snippet"]) || ""

lines << line_type.new(start_ms, end_ms, text)
end

return Transcript.new(
lines: lines,
language_code: language_code,
auto_generated: auto_generated,
label: label
)
end

start_ms = line["startMs"].as_s.to_i.millisecond
end_ms = line["endMs"].as_s.to_i.millisecond
# Converts transcript lines to a WebVTT file
#
# This is used within Invidious to replace subtitles
# so as to work around YouTube's rate-limited timedtext endpoint.
def to_vtt
settings_field = {
"Kind" => "captions",
"Language" => @language_code,
}

text = extract_text(line["snippet"]) || ""
vtt = WebVTT.build(settings_field) do |vtt|
@lines.each do |line|
# Section headers are excluded from the VTT conversion so as to
# match the regular captions returned from YouTube as much as possible
next if line.is_a? HeadingLine

lines << TranscriptLine.new(start_ms, end_ms, text)
vtt.cue(line.start_ms, line.end_ms, line.line)
end
end

return lines
return vtt
end
end
end
Loading