Skip to content

Commit

Permalink
voice range
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxMax2016 committed Apr 2, 2023
1 parent 76c2375 commit a0dbed1
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 17 deletions.
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@ maxgan v2 == bigvgan + latent f0 PlayVoice/maxgan-svc
提取每个音频文件的音色

> python svc_preprocess_speaker.py ./data_svc/waves ./data_svc/speaker
取所有音频音色的平均作为目标发音人的音色

> python svc_preprocess_speaker_lora.py ./data_svc/
- 3 下载whisper模型 [multiple language medium model](https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt), 确定下载的是**medium.pt**,把它放到文件夹 **whisper_pretrain/** 中,提取每个音频的内容编码

Expand All @@ -40,7 +36,13 @@ maxgan v2 == bigvgan + latent f0 PlayVoice/maxgan-svc

> python svc_preprocess_f0.py
- 5 从release页面下载预训练模型maxgan_pretrain,放到model_pretrain文件夹中,预训练模型中包含了生成器和判别器
- 5 取所有音频音色的平均作为目标发音人的音色,并完成声域分析

> python svc_preprocess_speaker_lora.py ./data_svc/
生成 lora_speaker.npy 和 lora_pitch_statics.npy 两个文件

- 6 从release页面下载预训练模型maxgan_pretrain,放到model_pretrain文件夹中,预训练模型中包含了生成器和判别器

> python svc_trainer.py -c config/maxgan.yaml -n lora -p model_pretrain/maxgan_pretrain.pth
Expand Down
2 changes: 1 addition & 1 deletion model/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def forward(self, spk, x, pos, f0):
# nsf
f0 = f0[:, None]
f0 = self.f0_upsamp(f0).transpose(1, 2)
har_source, noi_source, uv = self.m_source(f0)
har_source = self.m_source(f0)
har_source = har_source.transpose(1, 2)
# pre conv
x = self.cond_pre(x) # [B, L, D]
Expand Down
14 changes: 3 additions & 11 deletions model/nsf.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ def forward(self, f0):
# first: set the unvoiced part to 0 by uv
# then: additive noise
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
return sine_waves


class SourceModuleCycNoise_v1(torch.nn.Module):
Expand Down Expand Up @@ -404,14 +404,6 @@ def forward(self, x):
noise_source (batchsize, length 1)
"""
# source for harmonic branch
sine_wavs, uv, _ = self.l_sin_gen(x)
sine_wavs = self.l_sin_gen(x)
sine_merge = self.l_tanh(self.l_linear(sine_wavs))

# source for noise branch, in the same shape as uv
noise = torch.randn_like(uv) * self.sine_amp / 3
return sine_merge, noise, uv


if __name__ == "__main__":
source = SourceModuleCycNoise_v1(24000)
x = torch.randn(16, 25600, 1)
return sine_merge
24 changes: 24 additions & 0 deletions svc_preprocess_speaker_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,27 @@
print(speaker_ave)
np.save(os.path.join(data_svc, "lora_speaker.npy"),
speaker_ave, allow_pickle=False)

# Pitch-range analysis: aggregate per-file pitch statistics found under
# <data_svc>/pitch and store [average, minimum, maximum] in
# <data_svc>/lora_pitch_statics.npy.
if os.path.isdir(os.path.join(data_svc, "pitch")):
    pitch_dir = os.path.join(data_svc, "pitch")
    subfile_num = 0      # number of files that contributed to the statistics
    speaker_ave = 0      # running sum of per-file means, divided at the end
    speaker_max = 0
    speaker_min = 1000
    for file in os.listdir(pitch_dir):
        if not file.endswith(".npy"):
            continue
        pitch = np.load(os.path.join(pitch_dir, file))
        pitch = pitch.astype(np.float32)
        # Only strictly positive values take part in the statistics
        # (presumably zeros mark unvoiced frames -- confirm against the
        # f0 extraction script).
        pitch = pitch[pitch > 0]
        if pitch.size == 0:
            # An empty array would make mean() return NaN and max()/min()
            # raise, so a file without positive values is skipped instead
            # of aborting or poisoning the average.
            print(f'{file} has no positive pitch values, skipped')
            continue
        speaker_ave = speaker_ave + pitch.mean()
        subfile_num = subfile_num + 1
        file_max = pitch.max()
        file_min = pitch.min()
        if speaker_max < file_max:
            speaker_max = file_max
            print(f'{file} has {speaker_max}')
        if speaker_min > file_min:
            speaker_min = file_min
            print(f'{file} has {speaker_min}')
    if subfile_num == 0:
        # Fail with an explicit message rather than a ZeroDivisionError.
        raise ValueError(
            f"no usable pitch .npy files found in {pitch_dir}")
    # Average of the per-file means (each file weighs the same).
    speaker_ave = speaker_ave / subfile_num
    pitch_statics = [speaker_ave, speaker_min, speaker_max]
    print(pitch_statics)
    np.save(os.path.join(data_svc, "lora_pitch_statics.npy"),
            pitch_statics, allow_pickle=False)

0 comments on commit a0dbed1

Please sign in to comment.