voice range

PlayVoice · Apr 2, 2023 · a0dbed1 · a0dbed1
1 parent 76c2375
commit a0dbed1
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -27,10 +27,6 @@ maxgan v2 == bigvgan + latent f0  PlayVoice/maxgan-svc
     提取每个音频文件的音色
 
     > python svc_preprocess_speaker.py ./data_svc/waves ./data_svc/speaker
-    
-    取所有音频音色的平均作为目标发音人的音色
-
-    > python svc_preprocess_speaker_lora.py ./data_svc/
 
 - 3 下载whisper模型 [multiple language medium model](https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt), 确定下载的是**medium.pt**，把它放到文件夹 **whisper_pretrain/** 中，提取每个音频的内容编码
 
@@ -40,7 +36,13 @@ maxgan v2 == bigvgan + latent f0  PlayVoice/maxgan-svc
 
     > python svc_preprocess_f0.py
 
-- 5 从release页面下载预训练模型maxgan_pretrain，放到model_pretrain文件夹中，预训练模型中包含了生成器和判别器
+- 5 取所有音频音色的平均作为目标发音人的音色，并完成声域分析（统计有声帧音高的平均值、最小值和最大值）
+
+    > python svc_preprocess_speaker_lora.py ./data_svc/
+
+    在 ./data_svc/ 目录下生成 lora_speaker.npy 和 lora_pitch_statics.npy 两个文件（后者仅在 ./data_svc/pitch 目录存在时生成）
+
+- 6 从release页面下载预训练模型maxgan_pretrain，放到model_pretrain文件夹中，预训练模型中包含了生成器和判别器
 
     > python svc_trainer.py -c config/maxgan.yaml -n lora -p model_pretrain/maxgan_pretrain.pth
 

diff --git a/model/generator.py b/model/generator.py
@@ -117,7 +117,7 @@ def forward(self, spk, x, pos, f0):
         # nsf
         f0 = f0[:, None]
         f0 = self.f0_upsamp(f0).transpose(1, 2)
-        har_source, noi_source, uv = self.m_source(f0)
+        har_source = self.m_source(f0)
         har_source = har_source.transpose(1, 2)
         # pre conv
         x = self.cond_pre(x)                # [B, L, D]

diff --git a/model/nsf.py b/model/nsf.py
@@ -313,7 +313,7 @@ def forward(self, f0):
             # first: set the unvoiced part to 0 by uv
             # then: additive noise
             sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
+        return sine_waves
 
 
 class SourceModuleCycNoise_v1(torch.nn.Module):
@@ -404,14 +404,6 @@ def forward(self, x):
         noise_source (batchsize, length 1)
         """
         # source for harmonic branch
-        sine_wavs, uv, _ = self.l_sin_gen(x)
+        sine_wavs = self.l_sin_gen(x)
         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
-
-        # source for noise branch, in the same shape as uv
-        noise = torch.randn_like(uv) * self.sine_amp / 3
-        return sine_merge, noise, uv
-
-
-if __name__ == "__main__":
-    source = SourceModuleCycNoise_v1(24000)
-    x = torch.randn(16, 25600, 1)
+        return sine_merge
diff --git a/svc_preprocess_speaker_lora.py b/svc_preprocess_speaker_lora.py
@@ -23,3 +23,27 @@
         print(speaker_ave)
         np.save(os.path.join(data_svc, "lora_speaker.npy"),
                 speaker_ave, allow_pickle=False)
+
+    if os.path.isdir(os.path.join(data_svc, "pitch")):
+        subfile_num = 0
+        speaker_ave = 0
+        speaker_max = 0
+        speaker_min = 1000
+        for file in os.listdir(os.path.join(data_svc, "pitch")):
+            if file.endswith(".npy"):
+                pitch = np.load(os.path.join(data_svc, "pitch", file))
+                pitch = pitch.astype(np.float32)
+                pitch = pitch[pitch > 0]
+                speaker_ave = speaker_ave + pitch.mean()
+                subfile_num = subfile_num + 1
+                if (speaker_max < pitch.max()):
+                    speaker_max = pitch.max()
+                    print(f'{file} has {speaker_max}')
+                if (speaker_min > pitch.min()):
+                    speaker_min = pitch.min()
+                    print(f'{file} has {speaker_min}')
+        speaker_ave = speaker_ave / subfile_num
+        pitch_statics = [speaker_ave, speaker_min, speaker_max]
+        print(pitch_statics)
+        np.save(os.path.join(data_svc, "lora_pitch_statics.npy"),
+                pitch_statics, allow_pickle=False)