-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
111 lines (83 loc) · 4.16 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import string

import elevenlabslib
import elevenlabslib.helpers
import whisper
import whisperx
from pydub import AudioSegment
# Take the ElevenLabs API key from the ELEVENLABS_API_KEY environment variable so
# the secret does not have to be written into this file. The original placeholder
# is kept as the fallback so that editing it in place still works.
api_key = os.environ.get("ELEVENLABS_API_KEY", "API KEY HERE")
user = elevenlabslib.ElevenLabsUser(api_key)
# Voice used by generate_and_splice_text: the first voice returned for the name "Rachel".
voice = user.get_voices_by_name("Rachel")[0]
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _normalize_word(word):
    """Return word lower-cased and with all ASCII punctuation removed."""
    return word.lower().translate(_PUNCTUATION_TABLE)


def _find_quote_bounds(wordSegments, quotedWords, audioDuration):
    """
    Locate the quoted words inside the aligned transcript.

    :param wordSegments: The aligned word entries; each one is read through its "text", "start" and "end" keys.
    :param quotedWords: The normalized words of the quote, in order.
    :param audioDuration: Length of the full audio in seconds, used as the end time when the quote is the last thing spoken.
    :return: A (start, end) tuple in seconds, or None if the quote was not found.
    """
    if not quotedWords:
        return None

    spokenWords = [_normalize_word(segment["text"]) for segment in wordSegments]

    # Every position where the first quoted word occurs is a candidate start.
    possibleStarts = list()
    for index, spokenWord in enumerate(spokenWords):
        if spokenWord == quotedWords[0]:
            print("Possibly found the start?")
            possibleStarts.append(index)

    for possibleStart in possibleStarts:
        print(f"Checking from {possibleStart}")
        endIndex = possibleStart + len(quotedWords) - 1
        # A slice that runs past the end of the transcript is shorter than the
        # quote, so it simply compares unequal instead of raising.
        if spokenWords[possibleStart:endIndex + 1] != quotedWords:
            print("Did not match the entire phrase.")
            continue

        print("Found our match.")
        quoteStartTime = wordSegments[possibleStart]["start"]
        if endIndex + 1 >= len(wordSegments):
            # The quote is the last thing spoken: there is no following word to
            # look at, so keep everything up to the end of the audio.
            quoteEndTime = audioDuration
        else:
            # Cut halfway through the pause between the last quoted word and
            # the word that follows it.
            quoteEndTime = wordSegments[endIndex]["end"]
            quoteEndTime += (wordSegments[endIndex + 1]["start"] - quoteEndTime) / 2
        return quoteStartTime, quoteEndTime

    return None


def generate_and_splice_text(quotedText, postText):
    """
    Generate speech for '"quotedText" postText' and cut out only the quoted part.

    This function will generate two files: audio.mp3, with the full audio, and cut_audio.mp3, with only the quotedText.
    It waits for the user to press Enter before returning.

    :param quotedText: The text to be put in quotes.
    :param postText: The text after the quoted section that will be removed from the final audio file.
    """
    device = "cuda"
    audioFile = "audio.mp3"
    splicedAudioFile = "cut_audio.mp3"

    print(f'Text to be generated: "{quotedText}" {postText}')
    audioData = voice.generate_audio_bytes(f'"{quotedText}" {postText}')
    elevenlabslib.helpers.save_audio_bytes(audioData, audioFile, outputFormat="mp3")

    # transcribe with original whisper
    model = whisper.load_model("medium.en", device)
    result = model.transcribe(audioFile)

    # load alignment model and metadata
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)

    # align whisper output
    result_aligned = whisperx.align(result["segments"], model_a, metadata, audioFile, device)
    print(result_aligned["word_segments"])  # after alignment

    # If all you're interested in is how to use whisperX to get the timestamps, you can stop here.

    # Now we need to identify the start and end of the sentence.
    # This assumes whisper correctly identifies every word (besides the punctuation we strip),
    # and that the quoted sentence is not re-used elsewhere word for word.

    # Get all the quoted words, stripped of punctuation and casing. Tokens that
    # are empty after stripping (repeated spaces, stand-alone punctuation) are
    # dropped, since they can never match a transcript word.
    quotedWords = [
        normalizedWord
        for normalizedWord in (_normalize_word(word) for word in quotedText.split())
        if normalizedWord
    ]

    fullAudioSegment = AudioSegment.from_mp3(audioFile)
    quoteBounds = _find_quote_bounds(
        result_aligned["word_segments"],
        quotedWords,
        fullAudioSegment.duration_seconds,
    )
    # When the quote is not found both times stay at 0.0, as before, which
    # yields an empty cut.
    quoteStartTime, quoteEndTime = quoteBounds if quoteBounds is not None else (0.0, 0.0)

    print(f"Identified start time: {quoteStartTime}\nIdentified end time: {quoteEndTime}")

    # Use pydub to extract the section of audio (pydub slices are in milliseconds).
    cutAudioSegment = fullAudioSegment[quoteStartTime*1000:quoteEndTime*1000]
    cutAudioSegment.export(splicedAudioFile, format="mp3")
    input("Done. Press Enter to exit.")
# Script entry point: run one example generation when executed directly.
if __name__ == "__main__":
    generate_and_splice_text(
        quotedText="Don't test me!",
        postText="she shouted angrily.",
    )