forked from Plachtaa/VITS-fast-fine-tuning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_v2.py
154 lines (144 loc) · 6.55 KB
/
preprocess_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import argparse
import json
import sys

# Text cleaners may recurse deeply on long inputs; raise the limit so
# cleaning a large corpus does not die with RecursionError.
sys.setrecursionlimit(500000)


def _str2bool(value):
    """Parse a command-line boolean flag value.

    argparse's ``type=bool`` is a known trap: ``bool("False")`` is True
    because every non-empty string is truthy.  This parser accepts the
    usual spellings, so ``--add_auxiliary_data True`` keeps working and
    ``--add_auxiliary_data False`` now actually means False.
    """
    return str(value).strip().lower() in ("true", "1", "yes", "y")


def _load_config(path="./configs/finetune_speaker.json"):
    """Load the fine-tuning hyperparameter config as a dict."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _write_config(hps, speakers, path="./configs/modified_finetune_speaker.json"):
    """Patch the config for fine-tuning and save it.

    Assigns a sequential id to each speaker, overwrites the speaker table
    and training parameters, writes the modified config to *path*, and
    returns the speaker-name -> id mapping.
    """
    speaker2id = {speaker: i for i, speaker in enumerate(speakers)}
    hps['data']["n_speakers"] = len(speakers)
    hps['speakers'] = speaker2id
    hps['train']['log_interval'] = 10
    hps['train']['eval_interval'] = 100
    hps['train']['batch_size'] = 16
    hps['data']['training_files'] = "final_annotation_train.txt"
    hps['data']['validation_files'] = "final_annotation_val.txt"
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(hps, f, indent=2)
    return speaker2id


def _clean_annotations(annos, hps, speaker2id, strip_zh_tag=False):
    """Clean annotation lines and replace speaker names with ids.

    Lines whose transcription exceeds 150 characters are dropped.  When
    *strip_zh_tag* is True, literal "[ZH]" markers are removed from the
    cleaned text (matches the original no-auxiliary-data behaviour).
    """
    import text  # project-local module; imported lazily as in the original
    cleaned = []
    for line in annos:
        # maxsplit=2 tolerates a '|' inside the transcription itself.
        path, speaker, txt = line.split("|", 2)
        if len(txt) > 150:
            continue
        cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
        if strip_zh_tag:
            cleaned_text = cleaned_text.replace("[ZH]", "")
        if not cleaned_text.endswith("\n"):
            cleaned_text += "\n"
        cleaned.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
    return cleaned


def _write_annotations(train_annos, val_annos):
    """Write the final train/validation annotation files."""
    with open("./final_annotation_train.txt", 'w', encoding='utf-8') as f:
        f.writelines(train_annos)
    with open("./final_annotation_val.txt", 'w', encoding='utf-8') as f:
        f.writelines(val_annos)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # NOTE: the original used type=bool, which silently parsed "False"
    # as True; _str2bool fixes that while keeping the CLI compatible.
    parser.add_argument("--add_auxiliary_data", type=_str2bool, default=False,
                        help="Whether to add extra data as fine-tuning helper")
    parser.add_argument("--languages", default="CJE")
    args = parser.parse_args()

    if args.languages == "CJE":
        langs = ["[ZH]", "[JA]", "[EN]"]
    elif args.languages == "CJ":
        langs = ["[ZH]", "[JA]"]
    elif args.languages == "C":
        langs = ["[ZH]"]
    else:
        # The original left `langs` unbound here and crashed later with a
        # NameError; fail fast with a clear message instead.
        raise ValueError(f"Unsupported --languages value: {args.languages!r}")

    new_annos = []
    # Source 1: transcribed short audios
    if os.path.exists("short_character_anno.txt"):
        with open("short_character_anno.txt", 'r', encoding='utf-8') as f:
            new_annos += f.readlines()
    # Source 2: transcribed long audio segments
    if os.path.exists("./long_character_anno.txt"):
        with open("./long_character_anno.txt", 'r', encoding='utf-8') as f:
            new_annos += f.readlines()

    # Collect unique speaker names in order of first appearance.
    speakers = []
    for line in new_annos:
        _, speaker, _ = line.split("|", 2)
        if speaker not in speakers:
            speakers.append(speaker)
    if not speakers:
        # An assert would be stripped under `python -O`; raise instead.
        raise RuntimeError("No audio file found. Please check your uploaded file structure.")

    if args.add_auxiliary_data:
        # Source 3 (optional): sampled audios as extra training helpers.
        with open("./sampled_audio4ft.txt", 'r', encoding='utf-8') as f:
            old_annos = f.readlines()
        # Keep only lines in a supported language.  The original appended
        # a line once PER matching tag, duplicating multilingual lines;
        # any() adds each line at most once.
        old_annos = [line for line in old_annos
                     if any(lang in line for lang in langs)]
        for line in old_annos:
            _, speaker, _ = line.split("|", 2)
            if speaker not in speakers:
                speakers.append(speaker)
        # STEP 1: balance new vs. old voices by duplicating the (usually
        # much smaller) new set, at least once.
        cc_duplicate = max(1, len(old_annos) // max(1, len(new_annos)))
        # STEP 2: modify config file
        hps = _load_config()
        speaker2id = _write_config(hps, speakers)
        # STEP 3: clean annotations, replacing speaker names with ids.
        cleaned_new_annos = _clean_annotations(new_annos, hps, speaker2id)
        cleaned_old_annos = _clean_annotations(old_annos, hps, speaker2id)
        # Merge for training; validation uses only the target-speaker data.
        _write_annotations(cleaned_old_annos + cc_duplicate * cleaned_new_annos,
                           cleaned_new_annos)
    else:
        # No auxiliary data: patch config, clean new annotations only.
        # The [ZH] tags are stripped here, as in the original script.
        hps = _load_config()
        speaker2id = _write_config(hps, speakers)
        cleaned_new_annos = _clean_annotations(new_annos, hps, speaker2id,
                                               strip_zh_tag=True)
        _write_annotations(cleaned_new_annos, cleaned_new_annos)
    print("finished")