Skip to content

Commit

Permalink
Merge pull request #718 from IAHispano/formatter/main
Browse files (browse the repository at this point in the history)
chore(format): run black on main
  • Loading branch information
blaisewf authored Sep 19, 2024
2 parents 17180b2 + d58fbf2 commit 8b68a28
Show file tree
Hide file tree
Showing 3 changed files with 287 additions and 81 deletions.
48 changes: 36 additions & 12 deletions rvc/infer/pipeline.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -417,41 +417,62 @@ def voice_conversion(
with torch.no_grad():
pitch_guidance = pitch != None and pitchf != None
# prepare source audio
feats = torch.from_numpy(audio0).half() if self.is_half else torch.from_numpy(audio0).float()
feats = (
torch.from_numpy(audio0).half()
if self.is_half
else torch.from_numpy(audio0).float()
)
feats = feats.mean(-1) if feats.dim() == 2 else feats
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1).to(self.device)
# extract features
feats = model(feats)["last_hidden_state"]
feats = model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
feats = (
model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
)
# make a copy for pitch guidance and protection
feats0 = feats.clone() if pitch_guidance else None
if index: # set by parent function, only true if index is available, loaded, and index rate > 0
feats = self._retrieve_speaker_embeddings(feats, index, big_npy, index_rate)
# feature upsampling
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
if (
index
): # set by parent function, only true if index is available, loaded, and index rate > 0
feats = self._retrieve_speaker_embeddings(
feats, index, big_npy, index_rate
)
# feature upsampling
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
0, 2, 1
)
# adjust the length if the audio is short
p_len = min(audio0.shape[0] // self.window, feats.shape[1])
if pitch_guidance:
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
0, 2, 1
)
pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
# Pitch protection blending
if protect < 0.5:
pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1
pitchff[pitchf < 1] = protect
feats = feats * pitchff.unsqueeze(-1) + feats0 * (1 - pitchff.unsqueeze(-1))
feats = feats * pitchff.unsqueeze(-1) + feats0 * (
1 - pitchff.unsqueeze(-1)
)
feats = feats.to(feats0.dtype)
else:
pitch, pitchf = None, None
p_len = torch.tensor([p_len], device=self.device).long()
audio1 = ((net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]).data.cpu().float().numpy())
audio1 = (
(net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
.data.cpu()
.float()
.numpy()
)
# clean up
del feats, feats0, p_len
if torch.cuda.is_available():
torch.cuda.empty_cache()
return audio1

def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate):
npy = feats[0].cpu().numpy()
npy = npy.astype("float32") if self.is_half else npy
Expand All @@ -460,9 +481,12 @@ def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate):
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
npy = npy.astype("float16") if self.is_half else npy
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
feats = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ (1 - index_rate) * feats
)
return feats

def pipeline(
self,
model,
Expand Down
2 changes: 2 additions & 0 deletions rvc/train/data_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,6 +6,7 @@
from mel_processing import spectrogram_torch
from utils import load_filepaths_and_text, load_wav_to_torch


class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
"""
Dataset that loads text and audio pairs.
Expand Down Expand Up @@ -163,6 +164,7 @@ def __len__(self):
"""
return len(self.audiopaths_and_text)


class TextAudioCollateMultiNSFsid:
"""
Collates text and audio data for training.
Expand Down
Loading

0 comments on commit 8b68a28

Please sign in to comment.