-
Notifications
You must be signed in to change notification settings - Fork 5
/
voice_translator.py
146 lines (118 loc) · 4.59 KB
/
voice_translator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import json
import argparse
import scipy
import whisper
import librosa
from moviepy.editor import (
VideoFileClip,
AudioFileClip,
CompositeAudioClip,
)
from bark import generate_audio, preload_models, SAMPLE_RATE
from pydub import AudioSegment
from toolkit import VideoProcessor as vp
def change_audio_speed(audio_file, speed):
    """Re-time *audio_file* to play at *speed*x and write the result as a WAV.

    Parameters
    ----------
    audio_file : str
        Path to the source audio file.
    speed : float
        Target speed factor, must be > 0. Factors > 1 speed the audio up
        via pydub; factors <= 1 slow it down (or keep it) via librosa
        time-stretching.

    Returns
    -------
    str
        Path of the newly written WAV file ("<audio_file>_edited.wav").

    Raises
    ------
    ValueError
        If *speed* is not strictly positive (librosa would otherwise fail
        with an obscure error or emit garbage audio).
    """
    if speed <= 0:
        raise ValueError(f"speed must be positive, got {speed}")
    name = f"{audio_file}_edited.wav"
    print("Changing speed to: ", speed, " of audio file: ", audio_file)
    if speed > 1:
        # pydub's speedup only supports factors > 1; 150 ms chunks with a
        # 25 ms crossfade keep the sped-up audio free of clicks.
        sound = AudioSegment.from_file(audio_file)
        so = sound.speedup(speed, 150, 25)
        # Explicit format instead of relying on extension sniffing of the
        # double-extension output name.
        so.export(name, format="wav")
    else:
        song, fs = librosa.load(audio_file)
        audio_stretched = librosa.effects.time_stretch(y=song, rate=speed)
        scipy.io.wavfile.write(name, fs, audio_stretched)
    return name
def video_translation(video_path_data):
    """Transcribe and translate a video's audio track into English segments.

    Extracts the audio, runs Whisper's "large" model with task="translate",
    and writes per-segment info (id, translated text, timing, and an empty
    ``audio_file`` placeholder) to "<video stem>_audio_info.json" so the
    user can review/edit it before running :func:`audio_generator`.

    Parameters
    ----------
    video_path_data : str
        Path to the input video file.
    """
    # splitext instead of the old [:-4] slice, which silently corrupted
    # names for extensions that are not exactly 3 characters (.webm, .mkv).
    base = os.path.splitext(video_path_data)[0]
    input_video_file_clip = VideoFileClip(video_path_data)
    audio_path = vp.get_audio(input_video_file_clip, base)
    whisper_model = whisper.load_model("large")
    results = whisper_model.transcribe(audio_path, task="translate")
    audio_info = [
        {
            "id": segment["id"],
            "text": segment["text"].strip(),
            "audio_file": "",  # filled in later by audio_generator
            "start": segment["start"],
            "end": segment["end"],
        }
        for segment in results["segments"]
    ]
    json_file = f"{base}_audio_info.json"
    with open(json_file, "w", encoding="utf-8") as outfile:
        # ensure_ascii=False + indent keep the JSON human-editable, which
        # is the point of the "check it before generating" step below.
        json.dump(audio_info, outfile, ensure_ascii=False, indent=2)
    print(
        "Audio info saved to: ",
        json_file,
        " check it before generating audio with audio_generator",
    )
def audio_generator(video_path_data, voice_info="v2/en_speaker_2", low_profile=True):
    """Generate Bark speech for each translated segment and mux it onto the video.

    Reads "<stem>_audio_info.json" (created by :func:`video_translation`),
    generates one WAV per segment that does not yet have an ``audio_file``,
    time-stretches each clip to fit its original time slot, and writes
    "<stem>_final_video.mp4".

    Parameters
    ----------
    video_path_data : str
        Path to the input video file.
    voice_info : str
        Bark history prompt / speaker preset.
    low_profile : bool
        Force CPU and small Bark models for less powerful machines.
    """
    if low_profile:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        os.environ["SUNO_USE_SMALL_MODELS"] = "1"
    preload_models()
    # splitext instead of [:-4]: robust against extensions != 3 chars.
    base = os.path.splitext(video_path_data)[0]
    json_file = f"{base}_audio_info.json"
    with open(json_file, "r", encoding="utf-8") as openfile:
        audio_clips = json.load(openfile)
    for segment in audio_clips:
        if segment["audio_file"] != "":
            continue  # already generated on a previous (possibly interrupted) run
        audio_file = f"{base}_generated_audio_{segment['id']}.wav"
        print("Writing: ", segment["text"], " to ", audio_file)
        audio_array = generate_audio(segment["text"], history_prompt=voice_info)
        scipy.io.wavfile.write(audio_file, rate=SAMPLE_RATE, data=audio_array)
        segment["audio_file"] = audio_file
        # Persist progress after EVERY segment (Bark generation is slow and
        # crash-prone on low-profile machines); the old single save after
        # the loop lost all bookkeeping on an interrupted run, defeating
        # the resume check above.
        with open(json_file, "w", encoding="utf-8") as outfile:
            json.dump(audio_clips, outfile, ensure_ascii=False)
    input_video_file_clip_no_audio = VideoFileClip(video_path_data).without_audio()
    clips = []
    for item in audio_clips:
        duration = item["end"] - item["start"]
        if duration <= 0:
            # Guard against ZeroDivisionError on degenerate segments.
            print("Skipping zero-length segment: ", item["id"])
            continue
        audio = AudioFileClip(item["audio_file"])
        target_speed = audio.duration / duration
        audio_file = change_audio_speed(item["audio_file"], target_speed)
        audio = AudioFileClip(audio_file)
        audio = audio.set_start(item["start"])
        clips.append(audio)
    audio = CompositeAudioClip(clips)
    audio = audio.subclip(0, input_video_file_clip_no_audio.duration)
    input_video_file_clip_no_audio = input_video_file_clip_no_audio.set_audio(audio)
    final_video_name = f"{base}_final_video.mp4"
    input_video_file_clip_no_audio.write_videofile(final_video_name, audio_codec="aac")
    print("Final video saved to: ", final_video_name)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"operation",
type=str,
help="The operation to be performed, can be video_translation or audio_generator",
)
parser.add_argument("video_path", type=str, help="The video file to be translated")
parser.add_argument(
"--voice",
type=str,
default="v2/en_speaker_2",
help="The voice to be used in the translation",
)
parser.add_argument(
"--low_profile_mode",
type=bool,
default=True,
help="Use low profile mode for less powerful computers",
)
args = parser.parse_args()
operation = args.operation
video_path = args.video_path
voice = args.voice
low_profile_mode = args.low_profile_mode
if operation == "video_translation":
print("Starting video translation...")
video_translation(video_path)
elif operation == "audio_generator":
print("Starting audio generation...")
audio_generator(video_path, voice, low_profile_mode)
else:
print("Invalid operation, use --help for more info")