# barkapp.py -- Gradio front end for Bark text-to-speech (forked from suno-ai/bark)
from bark import SAMPLE_RATE, generate_audio, preload_models, save_as_prompt, text_to_semantic, semantic_to_waveform
import gradio as gr
import nltk
import numpy as np
import os
import random
import torch
import torchaudio

# sentence tokenizer data for nltk.sent_tokenize below
nltk.download('punkt')
torchaudio.set_audio_backend("soundfile")
def set_seed(seed):
    seed = int(seed)
    if seed == 0:
        seed = random.randint(0, 2**32 - 1)
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    # make CUDA/cuDNN as deterministic as possible so the same seed reproduces the same output
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    os.environ["PYTHONHASHSEED"] = str(seed)
    return seed
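
# Usage note: set_seed(0) draws a fresh random seed and returns it so the UI can
# report which seed was used; a nonzero value (e.g. set_seed(1234)) should make
# repeated generations reproducible.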
# generate audio for a text prompt; returns (sample_rate, waveform) for gr.Audio, plus the seed used
def text_to_audio(text, history_prompt, reuse_previous, pre_stitch, text_temp, coarse_temp, fine_temp, allow_early_stop, max_chunk_size, seed):
    seed = set_seed(seed)
    if history_prompt == "Unconditional":
        history_prompt = None
    # segment the text into sentences, e.g. "Hi there. How are you?" -> ["Hi there.", "How are you?"]
    text_prompts_list = nltk.sent_tokenize(text)
    if pre_stitch:
        # generate semantic tokens per sentence and stitch them before synthesizing audio
        outputs = []
        for i, prompt in enumerate(text_prompts_list, start=1):
            print(f"{i} of {len(text_prompts_list)}")
            temp_prompt = text_to_semantic(
                prompt,
                history_prompt,
                text_temp,
                allow_early_stop=allow_early_stop,
            )
            outputs.append(temp_prompt)
        # group the semantic outputs into chunks of at most max_chunk_size sentences
        # (e.g. 7 sentences with max_chunk_size=3 -> chunks of 3, 3 and 1)
        x_semantics = [outputs[i:i + max_chunk_size] for i in range(0, len(outputs), max_chunk_size)]
        audio_outputs = []
        for i, prompts in enumerate(x_semantics, start=1):
            print(f"{i} of {len(x_semantics)}")
            x_semantic = np.concatenate(prompts)
            audio_array = semantic_to_waveform(
                x_semantic,
                history_prompt,
                coarse_temp,
                fine_temp,
            )
            audio_outputs.append(audio_array)
        audio_arrays = np.concatenate(audio_outputs)
    else:
        # generate audio sentence by sentence and stitch the waveforms afterwards
        audio_arrays = np.array([])
        for i, prompt in enumerate(text_prompts_list, start=1):
            print(f"{i} of {len(text_prompts_list)}")
            full_generation, audio_array = generate_audio(
                text=prompt,
                history_prompt=history_prompt,
                text_temp=text_temp,
                coarse_temp=coarse_temp,
                fine_temp=fine_temp,
                output_full=True,
                allow_early_stop=allow_early_stop,
            )
            audio_arrays = np.concatenate((audio_arrays, audio_array))
            if reuse_previous:
                # save this chunk's full generation and use it as the history prompt for the next chunk
                os.makedirs(os.path.join(cwd, "bark/assets/userprompts"), exist_ok=True)
                save_as_prompt(os.path.join(cwd, f"bark/assets/userprompts/{i}.npz"), full_generation)
                history_prompt = os.path.join(cwd, f"bark/assets/userprompts/{i}.npz")
    # return the audio for the gr.Audio component, plus the seed that was used
    return (SAMPLE_RATE, audio_arrays), seed
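
# A minimal sketch of calling text_to_audio directly, outside the Gradio UI
# (argument values here are illustrative, not app defaults):
#
#   (rate, waveform), used_seed = text_to_audio(
#       "Hello there. This is Bark speaking.",  # two sentences -> two chunks
#       "Unconditional",   # no voice preset
#       False, False,      # reuse_previous, pre_stitch
#       0.7, 0.7, 0.5,     # text_temp, coarse_temp, fine_temp
#       True, 10, 0)       # allow_early_stop, max_chunk_size, seed (0 = random)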
# build the list of voice presets from the prompts folder
cwd = os.getcwd()
files = os.listdir(os.path.join(cwd, "bark/assets/prompts"))
# strip the file extensions and add the unconditional option
files = [os.path.splitext(f)[0] for f in files]
files.insert(0, "Unconditional")
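# files now looks something like ["Unconditional", "announcer", "en_speaker_0", ...],
# depending on what is in bark/assets/prompts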
# create the list of input components
inputs = [
    gr.Textbox(label="text"),
    gr.Dropdown(label="history_prompt", choices=files),
    gr.Checkbox(label="Reuse last chunk as history", value=False, info="Causes distortion in long prompts"),
    gr.Checkbox(label="Prestitch", value=False, info="If checked, the semantic prompts are stitched together before audio is generated; otherwise audio is generated chunk by chunk and then stitched"),
    gr.Slider(minimum=0.01, maximum=1.0, value=0.7, label="text_temp", info="Lower is more consistent with the input text (less likely to um and stammer)"),
    gr.Slider(minimum=0.01, maximum=1.0, value=0.7, label="coarse_temp", info="Lower is more consistent with the history_prompt; too low seems to copy the history_prompt"),
    gr.Slider(minimum=0.01, maximum=1.0, value=0.5, label="fine_temp", info="Lower is more consistent; seems to control intonation and pitch"),
    gr.Checkbox(label="Allow Early Stop", value=True, info="If unchecked, the model fills the entire context length"),
    gr.Slider(minimum=1, maximum=10, value=10, step=1, label="Max Chunk Size", info="The size of the chunks used in prestitching; larger chunks give more consistent long outputs but use more VRAM"),
    gr.Number(value=0, label="Seed", info="0 for random"),
]
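# gr.Interface passes these components to text_to_audio positionally, so their
# order must match the function's parameter order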
# create an interface object
interface = gr.Interface(text_to_audio, inputs, outputs=["audio", gr.Textbox(label="Seed Used")])
# download and load all models
preload_models()
# launch the interface
interface.launch()
# TODO:
# - Add "save as prompt"