-
Notifications
You must be signed in to change notification settings - Fork 40
/
Copy pathsubtitle_to_audio.py
65 lines (46 loc) · 2.23 KB
/
subtitle_to_audio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import tempfile
import argparse
from pysubparser import parser
from pydub import AudioSegment
import pyttsx3
# Query the speech engine once at import time and record the display name of
# every voice it reports, so a voice index can be turned into a readable name.
engine = pyttsx3.init()
voices = engine.getProperty('voices')
vlist = [voice.name for voice in voices]
def time_to_ms(time):
    """Return the offset described by *time* in milliseconds.

    *time* is any object exposing ``hour``, ``minute``, ``second`` and
    ``microsecond`` attributes (for example ``datetime.time``). The result
    is a float because the microsecond part is converted with true division.
    """
    whole_seconds = time.hour * 3600 + time.minute * 60 + time.second
    fractional_ms = time.microsecond / 1000
    return whole_seconds * 1000 + fractional_ms
def generate_audio(path, rate=200, voice_idx=0, output_dir="output"):
    """Speak every subtitle in *path* and export one time-aligned WAV file.

    Each subtitle is synthesized with pyttsx3 into a temporary WAV file and
    appended to a running track, preceded by enough silence for the speech
    to begin at the subtitle's start time. The finished track is written to
    ``<output_dir>/<voice name>.wav``.

    Args:
        path: Subtitle file path handed to ``pysubparser``.
        rate: Value for the pyttsx3 ``rate`` property (words per minute).
        voice_idx: Index into the list of voices reported by the engine.
        output_dir: Directory that receives the WAV file; created if missing.

    Raises:
        IndexError: If ``voice_idx`` does not select an available voice.
    """
    print("Generating audio file for {} with {}".format(path, "pyttsx3"))
    subtitles = parser.parse(path)

    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', rate)
    available_voices = tts_engine.getProperty('voices')
    try:
        voice = available_voices[voice_idx]
    except IndexError:
        raise IndexError(
            f"voice_idx {voice_idx} is out of range: "
            f"{len(available_voices)} voice(s) available"
        ) from None
    tts_engine.setProperty('voice', voice.id)

    audio_sum = AudioSegment.empty()
    with tempfile.TemporaryDirectory() as tmpdirname:
        print('created temporary directory', tmpdirname)
        temp_file_path = os.path.join(tmpdirname, "temp.wav")
        for subtitle in subtitles:
            tts_engine.save_to_file(subtitle.text, temp_file_path)
            tts_engine.runAndWait()
            audio_segment = AudioSegment.from_wav(temp_file_path)
            print(subtitle.start, subtitle.text)
            # Pad up to the cue's absolute start time, measured against what
            # has already been rendered. If earlier speech ran past this cue
            # the gap would be negative, so clamp to zero; measuring against
            # the real track length lets later cues fall back into sync
            # instead of carrying the overrun forward.
            silence_duration_ms = max(
                0, time_to_ms(subtitle.start) - len(audio_sum)
            )
            audio_sum = (
                audio_sum
                + AudioSegment.silent(duration=silence_duration_ms)
                + audio_segment
            )

    # The target directory may not exist on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    # Name the file after the voice that was actually selected above.
    output_path = os.path.join(output_dir, f'{voice.name}.wav')
    with open(output_path, 'wb') as out_f:
        audio_sum.export(out_f, format='wav')
if __name__ == "__main__":
    # Command-line entry point: parse the options and render the audio file.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-p", "--path", help="subtitle file path", default="two_single.srt")
    arg_parser.add_argument("-r", "--rate", help="speech rate(words per minute)", type=int, default=240)
    # Offer every voice the engine reported rather than only the first two,
    # and keep the default in range when fewer than two voices exist.
    default_voice_idx = 1 if len(vlist) > 1 else 0
    arg_parser.add_argument(
        "-v",
        "--voice-idx",
        help="voice selection",
        type=int,
        default=default_voice_idx,
        choices=range(len(vlist)),
    )
    args = arg_parser.parse_args()
    generate_audio(path=args.path, rate=args.rate, voice_idx=args.voice_idx)