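"""Transcribe per-person webcam recordings with faster-whisper and merge them
into a single time-ordered, chat-style transcript at output/transcript.txt."""
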
import os
import re

import torch
from faster_whisper import WhisperModel
from moviepy import VideoFileClip
def transcribe(individuals, word_pause=1):
    print("list of vids found to transcribe " + str(individuals))

    def _extract_name(file_name):
        # file names are expected to look like "..._-_<n>-<Name>-webcam..."
        pattern = r"_-_\d-(.*)-webcam"
        match = re.search(pattern, file_name)
        if match:
            name = match.group(1)
            print(f"found name {name}")
            return name
        return None

    def _format_seconds(seconds):
        # render a second count as "HHh MMm SSs"
        hours = int(seconds // 3600)
        seconds %= 3600
        minutes = int(seconds // 60)
        seconds = int(seconds % 60)
        return f"{hours:02d}h {minutes:02d}m {seconds:02d}s"

    def _format_chat(entry):
        # entry is [speaker, start, end, text]
        return f"[{_format_seconds(entry[1])}] {entry[0]}: {entry[3]}\n"
    transcription = []
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"using {device}")
    model = WhisperModel("large-v3", device)
    for i, vid in enumerate(individuals):
        print(f"transcribing video {vid}")
        name = _extract_name(vid) or f"Person{i}"
        # extract the audio track to a temporary wav for faster-whisper
        os.makedirs("temp", exist_ok=True)
        video = VideoFileClip(vid)
        video.audio.write_audiofile("temp/temp.wav")  # pyright: ignore
        segments, _ = model.transcribe(
            "temp/temp.wav",
            suppress_tokens=[],
            vad_filter=True,
            vad_parameters={"speech_pad_ms": 1000},
            initial_prompt="transcribe the following and include a *laughing* when there is laughter",
            condition_on_previous_text=True,
            log_progress=True,
            beam_size=5,
            word_timestamps=True,
            hotwords="laughing laugh laughter",
        )
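        # Merge word-level timestamps into utterances: consecutive words
        # separated by less than word_pause seconds are joined into one line.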
        # utterance being accumulated: [speaker, start, end, text]
        current_speech = [name, 0, 0, ""]
        for segment in segments:
            for word in segment.words or []:
                if word.end - current_speech[2] > word_pause:
                    # pause exceeded: flush the utterance and start a new one
                    if current_speech[3] != "":
                        transcription.append(current_speech)
                    # we use end as start since start isn't accurate
                    current_speech = [name, word.end - 0.5, word.end, word.word]
                else:
                    current_speech[3] += word.word
                    current_speech[2] = word.end
        # flush the final utterance for this speaker
        if current_speech[3] != "":
            transcription.append(current_speech)
        print(f"finished transcribing {name}")
        os.remove("temp/temp.wav")
    # interleave all speakers' utterances by start time into one transcript
    transcription.sort(key=lambda x: x[1])
    os.makedirs("output", exist_ok=True)
    with open("output/transcript.txt", "w") as file:
        for entry in transcription:
            file.write(_format_chat(entry))
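

# A minimal usage sketch (hypothetical paths; _extract_name expects file names
# containing "_-_<n>-<Name>-webcam"):
#
# transcribe(
#     [
#         "recordings/meeting_-_1-Alice-webcam.mp4",
#         "recordings/meeting_-_2-Bob-webcam.mp4",
#     ],
#     word_pause=1.5,
# )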