Merge pull request #1 from FredBill1/fix-timing-overlap

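Fix subtitle timing overlap (m-bain#816): when word-level timings are available, take each subtitle's start from its first timed word and its end from its last timed word, falling back to the segment's times otherwise.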
SaguaroCapital · Jul 2, 2024 · a832332 · a832332
2 parents f2da2f8 + faff50a
commit a832332
Showing 1 changed file with 8 additions and 4 deletions.
diff --git a/whisperx/utils.py b/whisperx/utils.py
@@ -278,15 +278,19 @@ def iterate_subtitles():
                 yield subtitle, times
 
         if "words" in result["segments"][0]:
-            for subtitle, _ in iterate_subtitles():
-                sstart, ssend, speaker = _[0]
+            for subtitle, times in iterate_subtitles():
+                # TODO: handle multiple segments with different start/end times and speakers
+                sstart, send, speaker = times[0]
+                has_timing = any(["start" in timing for timing in subtitle])
+                if has_timing:
+                    sstart = next(timing["start"] for timing in subtitle if "start" in timing)
+                    send = next(timing["end"] for timing in reversed(subtitle) if "end" in timing)
                 subtitle_start = self.format_timestamp(sstart)
-                subtitle_end = self.format_timestamp(ssend)
+                subtitle_end = self.format_timestamp(send)
                 if result["language"] in LANGUAGES_WITHOUT_SPACES:
                     subtitle_text = "".join([word["word"] for word in subtitle])
                 else:
                     subtitle_text = " ".join([word["word"] for word in subtitle])
-                has_timing = any(["start" in word for word in subtitle])
 
                 # add [$SPEAKER_ID]: to each subtitle if speaker is available
                 prefix = ""