diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py index cb540577f8..2166eebb3c 100644 --- a/TTS/tts/layers/tortoise/dpm_solver.py +++ b/TTS/tts/layers/tortoise/dpm_solver.py @@ -1548,4 +1548,4 @@ def expand_dims(v, dims): Returns: a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`. """ - return v[(...,) + (None,) * (dims - 1)] + return v[(...,) + (None,) * (dims - 1)] \ No newline at end of file diff --git a/TTS/tts/layers/xtts/diffusion.py b/TTS/tts/layers/xtts/diffusion.py deleted file mode 100644 index 37665bc676..0000000000 --- a/TTS/tts/layers/xtts/diffusion.py +++ /dev/null @@ -1,1319 +0,0 @@ -import enum -import math - -import numpy as np -import torch -import torch as th -from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral -from tqdm import tqdm - -from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper - -K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} -SAMPLERS = ["dpm++2m", "p", "ddim"] - - -def normal_kl(mean1, logvar1, mean2, logvar2): - """ - Compute the KL divergence between two gaussians. - - Shapes are automatically broadcasted, so batches can be compared to - scalars, among other use cases. - """ - tensor = None - for obj in (mean1, logvar1, mean2, logvar2): - if isinstance(obj, th.Tensor): - tensor = obj - break - assert tensor is not None, "at least one argument must be a Tensor" - - # Force variances to be Tensors. Broadcasting helps convert scalars to - # Tensors, but it does not work for th.exp(). - logvar1, logvar2 = [x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) for x in (logvar1, logvar2)] - - return 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2)) - - -def approx_standard_normal_cdf(x): - """ - A fast approximation of the cumulative distribution function of the - standard normal. - """ - return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) - - -def discretized_gaussian_log_likelihood(x, *, means, log_scales): - """ - Compute the log-likelihood of a Gaussian distribution discretizing to a - given image. - - :param x: the target images. It is assumed that this was uint8 values, - rescaled to the range [-1, 1]. - :param means: the Gaussian mean Tensor. - :param log_scales: the Gaussian log stddev Tensor. - :return: a tensor like x of log probabilities (in nats). - """ - assert x.shape == means.shape == log_scales.shape - centered_x = x - means - inv_stdv = th.exp(-log_scales) - plus_in = inv_stdv * (centered_x + 1.0 / 255.0) - cdf_plus = approx_standard_normal_cdf(plus_in) - min_in = inv_stdv * (centered_x - 1.0 / 255.0) - cdf_min = approx_standard_normal_cdf(min_in) - log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) - log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) - cdf_delta = cdf_plus - cdf_min - log_probs = th.where( - x < -0.999, - log_cdf_plus, - th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), - ) - assert log_probs.shape == x.shape - return log_probs - - -def mean_flat(tensor): - """ - Take the mean over all non-batch dimensions. - """ - return tensor.mean(dim=list(range(1, len(tensor.shape)))) - - -def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): - """ - Get a pre-defined beta schedule for the given name. - - The beta schedule library consists of beta schedules which remain similar - in the limit of num_diffusion_timesteps. 
- Beta schedules may be added, but should not be removed or changed once - they are committed to maintain backwards compatibility. - """ - if schedule_name == "linear": - # Linear schedule from Ho et al, extended to work for any number of - # diffusion steps. - scale = 1000 / num_diffusion_timesteps - beta_start = scale * 0.0001 - beta_end = scale * 0.02 - return np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64) - elif schedule_name == "cosine": - return betas_for_alpha_bar( - num_diffusion_timesteps, - lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, - ) - else: - raise NotImplementedError(f"unknown beta schedule: {schedule_name}") - - -def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, - which defines the cumulative product of (1-beta) over time from t = [0,1]. - - :param num_diffusion_timesteps: the number of betas to produce. - :param alpha_bar: a lambda that takes an argument t from 0 to 1 and - produces the cumulative product of (1-beta) up to that - part of the diffusion process. - :param max_beta: the maximum beta to use; use values lower than 1 to - prevent singularities. - """ - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return np.array(betas) - - -class ModelMeanType(enum.Enum): - """ - Which type of output the model predicts. - """ - - PREVIOUS_X = "previous_x" # the model predicts x_{t-1} - START_X = "start_x" # the model predicts x_0 - EPSILON = "epsilon" # the model predicts epsilon - - -class ModelVarType(enum.Enum): - """ - What is used as the model's output variance. - - The LEARNED_RANGE option has been added to allow the model to predict - values between FIXED_SMALL and FIXED_LARGE, making its job easier. - """ - - LEARNED = "learned" - FIXED_SMALL = "fixed_small" - FIXED_LARGE = "fixed_large" - LEARNED_RANGE = "learned_range" - - -class LossType(enum.Enum): - MSE = "mse" # use raw MSE loss (and KL when learning variances) - RESCALED_MSE = "rescaled_mse" # use raw MSE loss (with RESCALED_KL when learning variances) - KL = "kl" # use the variational lower-bound - RESCALED_KL = "rescaled_kl" # like KL, but rescale to estimate the full VLB - - def is_vb(self): - return self == LossType.KL or self == LossType.RESCALED_KL - - -class GaussianDiffusion: - """ - Utilities for training and sampling diffusion models. - - Ported directly from here, and then adapted over time to further experimentation. - https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 - - :param betas: a 1-D numpy array of betas for each diffusion timestep, - starting at T and going to 1. - :param model_mean_type: a ModelMeanType determining what the model outputs. - :param model_var_type: a ModelVarType determining how variance is output. - :param loss_type: a LossType determining the loss function to use. - :param rescale_timesteps: if True, pass floating point timesteps into the - model so that they are always scaled like in the - original paper (0 to 1000). 
- """ - - def __init__( - self, - *, - betas, - model_mean_type, - model_var_type, - loss_type, - rescale_timesteps=False, # this is generally False - conditioning_free=False, - conditioning_free_k=1, - ramp_conditioning_free=True, - sampler="ddim", - ): - self.sampler = sampler - self.model_mean_type = ModelMeanType(model_mean_type) - self.model_var_type = ModelVarType(model_var_type) - self.loss_type = LossType(loss_type) - self.rescale_timesteps = rescale_timesteps - self.conditioning_free = conditioning_free - self.conditioning_free_k = conditioning_free_k - self.ramp_conditioning_free = ramp_conditioning_free - - # Use float64 for accuracy. - betas = np.array(betas, dtype=np.float64) - self.betas = betas - assert len(betas.shape) == 1, "betas must be 1-D" - assert (betas > 0).all() and (betas <= 1).all() - - self.num_timesteps = int(betas.shape[0]) - - alphas = 1.0 - betas - self.alphas_cumprod = np.cumprod(alphas, axis=0) - self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) - self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) - assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) - self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) - self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) - self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) - self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) - - # calculations for posterior q(x_{t-1} | x_t, x_0) - self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) - # log calculation clipped because the posterior variance is 0 at the - # beginning of the diffusion chain. - self.posterior_log_variance_clipped = np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:])) - self.posterior_mean_coef1 = betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) - self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod) - - def q_mean_variance(self, x_start, t): - """ - Get the distribution q(x_t | x_0). - - :param x_start: the [N x C x ...] tensor of noiseless inputs. - :param t: the number of diffusion steps (minus 1). Here, 0 means one step. - :return: A tuple (mean, variance, log_variance), all of x_start's shape. - """ - mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start - variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) - log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape) - return mean, variance, log_variance - - def q_sample(self, x_start, t, noise=None): - """ - Diffuse the data for a given number of diffusion steps. - - In other words, sample from q(x_t | x_0). - - :param x_start: the initial data batch. - :param t: the number of diffusion steps (minus 1). Here, 0 means one step. - :param noise: if specified, the split-out normal noise. - :return: A noisy version of x_start. 
- """ - if noise is None: - noise = th.randn_like(x_start) - assert noise.shape == x_start.shape - return ( - _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start - + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise - ) - - def q_posterior_mean_variance(self, x_start, x_t, t): - """ - Compute the mean and variance of the diffusion posterior: - - q(x_{t-1} | x_t, x_0) - - """ - assert x_start.shape == x_t.shape - posterior_mean = ( - _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start - + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t - ) - posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) - posterior_log_variance_clipped = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape) - assert ( - posterior_mean.shape[0] - == posterior_variance.shape[0] - == posterior_log_variance_clipped.shape[0] - == x_start.shape[0] - ) - return posterior_mean, posterior_variance, posterior_log_variance_clipped - - def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None): - """ - Apply the model to get p(x_{t-1} | x_t), as well as a prediction of - the initial x, x_0. - - :param model: the model, which takes a signal and a batch of timesteps - as input. - :param x: the [N x C x ...] tensor at time t. - :param t: a 1-D Tensor of timesteps. - :param clip_denoised: if True, clip the denoised signal into [-1, 1]. - :param denoised_fn: if not None, a function which applies to the - x_start prediction before it is used to sample. Applies before - clip_denoised. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :return: a dict with the following keys: - - 'mean': the model mean output. - - 'variance': the model variance output. - - 'log_variance': the log of 'variance'. - - 'pred_xstart': the prediction for x_0. - """ - if model_kwargs is None: - model_kwargs = {} - - assert self.model_var_type == ModelVarType.LEARNED_RANGE - assert self.model_mean_type == ModelMeanType.EPSILON - assert denoised_fn is None - assert clip_denoised is True - B, C = x.shape[:2] - assert t.shape == (B,) - model_output = model(x, self._scale_timesteps(t), **model_kwargs) - if self.conditioning_free: - model_output_no_conditioning = model(x, self._scale_timesteps(t), conditioning_free=True, **model_kwargs) - - if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: - assert model_output.shape == (B, C * 2, *x.shape[2:]) - model_output, model_var_values = th.split(model_output, C, dim=1) - if self.conditioning_free: - model_output_no_conditioning, _ = th.split(model_output_no_conditioning, C, dim=1) - if self.model_var_type == ModelVarType.LEARNED: - assert False - model_log_variance = model_var_values - model_variance = th.exp(model_log_variance) - else: - min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape) - max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) - # The model_var_values is [-1, 1] for [min_var, max_var]. - frac = (model_var_values + 1) / 2 - model_log_variance = frac * max_log + (1 - frac) * min_log - model_variance = th.exp(model_log_variance) - else: - assert False - model_variance, model_log_variance = { - # for fixedlarge, we set the initial (log-)variance like so - # to get a better decoder log likelihood. 
- ModelVarType.FIXED_LARGE: ( - np.append(self.posterior_variance[1], self.betas[1:]), - np.log(np.append(self.posterior_variance[1], self.betas[1:])), - ), - ModelVarType.FIXED_SMALL: ( - self.posterior_variance, - self.posterior_log_variance_clipped, - ), - }[self.model_var_type] - model_variance = _extract_into_tensor(model_variance, t, x.shape) - model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) - - if self.conditioning_free: - if self.ramp_conditioning_free: - assert t.shape[0] == 1 # This should only be used in inference. - cfk = self.conditioning_free_k * (1 - self._scale_timesteps(t)[0].item() / self.num_timesteps) - else: - cfk = self.conditioning_free_k - model_output = (1 + cfk) * model_output - cfk * model_output_no_conditioning - - def process_xstart(x): - if denoised_fn is not None: - assert False - x = denoised_fn(x) - if clip_denoised: - return x.clamp(-1, 1) - assert False - return x - - if self.model_mean_type == ModelMeanType.PREVIOUS_X: - assert False - pred_xstart = process_xstart(self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)) - model_mean = model_output - elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]: - if self.model_mean_type == ModelMeanType.START_X: - assert False - pred_xstart = process_xstart(model_output) - else: - pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)) - model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t) - else: - raise NotImplementedError(self.model_mean_type) - - assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape - return { - "mean": model_mean, - "variance": model_variance, - "log_variance": model_log_variance, - "pred_xstart": pred_xstart, - } - - def _predict_xstart_from_eps(self, x_t, t, eps): - assert x_t.shape == eps.shape - return ( - _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps - ) - - def _predict_xstart_from_xprev(self, x_t, t, xprev): - assert x_t.shape == xprev.shape - return ( # (xprev - coef2*x_t) / coef1 - _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev - - _extract_into_tensor(self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape) * x_t - ) - - def _predict_eps_from_xstart(self, x_t, t, pred_xstart): - return ( - _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart - ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) - - def _scale_timesteps(self, t): - if self.rescale_timesteps: - return t.float() * (1000.0 / self.num_timesteps) - return t - - def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): - """ - Compute the mean for the previous step, given a function cond_fn that - computes the gradient of a conditional log probability with respect to - x. In particular, cond_fn computes grad(log(p(y|x))), and we want to - condition on y. - - This uses the conditioning strategy from Sohl-Dickstein et al. (2015). - """ - gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs) - new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() - return new_mean - - def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): - """ - Compute what the p_mean_variance output would have been, should the - model's score function be conditioned by cond_fn. - - See condition_mean() for details on cond_fn. 
- - Unlike condition_mean(), this instead uses the conditioning strategy - from Song et al (2020). - """ - alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) - - eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) - eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, self._scale_timesteps(t), **model_kwargs) - - out = p_mean_var.copy() - out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) - out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t) - return out - - def p_sample( - self, - model, - x, - t, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - ): - """ - Sample x_{t-1} from the model at the given timestep. - - :param model: the model to sample from. - :param x: the current tensor at x_{t-1}. - :param t: the value of t, starting at 0 for the first diffusion step. - :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. - :param denoised_fn: if not None, a function which applies to the - x_start prediction before it is used to sample. - :param cond_fn: if not None, this is a gradient function that acts - similarly to the model. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :return: a dict containing the following keys: - - 'sample': a random sample from the model. - - 'pred_xstart': a prediction of x_0. - """ - out = self.p_mean_variance( - model, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - model_kwargs=model_kwargs, - ) - noise = th.randn_like(x) - nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0 - if cond_fn is not None: - out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs) - sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise - return {"sample": sample, "pred_xstart": out["pred_xstart"]} - - def k_diffusion_sample_loop( - self, - k_sampler, - pbar, - model, - shape, - noise=None, # all given - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - device=None, # ALL UNUSED - model_kwargs=None, # {'precomputed_aligned_embeddings': precomputed_embeddings}, - progress=False, # unused as well - ): - assert isinstance(model_kwargs, dict) - if device is None: - device = next(model.parameters()).device - s_in = noise.new_ones([noise.shape[0]]) - - def model_split(*args, **kwargs): - model_output = model(*args, **kwargs) - model_epsilon, model_var = th.split(model_output, model_output.shape[1] // 2, dim=1) - return model_epsilon, model_var - - # - """ - print(self.betas) - print(th.tensor(self.betas)) - noise_schedule = NoiseScheduleVP(schedule='discrete', betas=th.tensor(self.betas)) - """ - noise_schedule = NoiseScheduleVP(schedule="linear", continuous_beta_0=0.1 / 4, continuous_beta_1=20.0 / 4) - - def model_fn_prewrap(x, t, *args, **kwargs): - """ - x_in = torch.cat([x] * 2) - t_in = torch.cat([t_continuous] * 2) - c_in = torch.cat([unconditional_condition, condition]) - noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2) - print(t) - print(self.timestep_map) - exit() - """ - """ - model_output = model(x, self._scale_timesteps(t*4000), **model_kwargs) - out = self.p_mean_variance(model, x, t*4000, model_kwargs=model_kwargs) - return out['pred_xstart'] - """ - x, _ = x.chunk(2) - t, _ = (t * 1000).chunk(2) - res = torch.cat( - [ - model_split(x, t, conditioning_free=True, **model_kwargs)[0], - model_split(x, t, **model_kwargs)[0], - ] - ) - 
pbar.update(1) - return res - - model_fn = model_wrapper( - model_fn_prewrap, - noise_schedule, - model_type="noise", # "noise" or "x_start" or "v" or "score" - model_kwargs=model_kwargs, - guidance_type="classifier-free", - condition=th.Tensor(1), - unconditional_condition=th.Tensor(1), - guidance_scale=self.conditioning_free_k, - ) - """ - model_fn = model_wrapper( - model_fn_prewrap, - noise_schedule, - model_type='x_start', - model_kwargs={} - ) - # - dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver") - x_sample = dpm_solver.sample( - noise, - steps=20, - order=3, - skip_type="time_uniform", - method="singlestep", - ) - """ - dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") - x_sample = dpm_solver.sample( - noise, - steps=self.num_timesteps, - order=2, - skip_type="time_uniform", - method="multistep", - ) - #''' - return x_sample - - # HF DIFFUSION ATTEMPT - """ - from .hf_diffusion import EulerAncestralDiscreteScheduler - Scheduler = EulerAncestralDiscreteScheduler() - Scheduler.set_timesteps(100) - for timestep in Scheduler.timesteps: - noise_input = Scheduler.scale_model_input(noise, timestep) - ts = s_in * timestep - model_output = model(noise_input, ts, **model_kwargs) - model_epsilon, _model_var = th.split(model_output, model_output.shape[1]//2, dim=1) - noise, _x0 = Scheduler.step(model_epsilon, timestep, noise) - return noise - """ - - # KARRAS DIFFUSION ATTEMPT - """ - TRAINED_DIFFUSION_STEPS = 4000 # HARDCODED - ratio = TRAINED_DIFFUSION_STEPS/14.5 - def call_model(*args, **kwargs): - model_output = model(*args, **kwargs) - model_output, model_var_values = th.split(model_output, model_output.shape[1]//2, dim=1) - return model_output - print(get_sigmas_karras(self.num_timesteps, sigma_min=0.0, sigma_max=4000, device=device)) - exit() - sigmas = get_sigmas_karras(self.num_timesteps, sigma_min=0.03, sigma_max=14.5, device=device) - return k_sampler(call_model, noise, sigmas, extra_args=model_kwargs, disable=not progress) - ''' - sigmas = get_sigmas_karras(self.num_timesteps, sigma_min=0.03, sigma_max=14.5, device=device) - step = 0 # LMAO - global_sigmas = None - # - def fakemodel(x, t, **model_kwargs): - print(t,global_sigmas*ratio) - return model(x, t, **model_kwargs) - def denoised(x, sigmas, **extra_args): - t = th.tensor([self.num_timesteps-step-1] * shape[0], device=device) - nonlocal global_sigmas - global_sigmas = sigmas - with th.no_grad(): - out = self.p_sample( - fakemodel, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - ) - return out["sample"] - def callback(d): - nonlocal step - step += 1 - - return k_sampler(denoised, noise, sigmas, extra_args=model_kwargs, callback=callback, disable=not progress) - ''' - """ - - def sample_loop(self, *args, **kwargs): - s = self.sampler - if s == "p": - return self.p_sample_loop(*args, **kwargs) - elif s == "ddim": - return self.ddim_sample_loop(*args, **kwargs) - elif s == "dpm++2m": - if self.conditioning_free is not True: - raise RuntimeError("cond_free must be true") - with tqdm(total=self.num_timesteps) as pbar: - return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs) - else: - raise RuntimeError("sampler not impl") - - def p_sample_loop( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - ): - """ - Generate samples from the model. - - :param model: the model module. 
- :param shape: the shape of the samples, (N, C, H, W). - :param noise: if specified, the noise from the encoder to sample. - Should be of the same shape as `shape`. - :param clip_denoised: if True, clip x_start predictions to [-1, 1]. - :param denoised_fn: if not None, a function which applies to the - x_start prediction before it is used to sample. - :param cond_fn: if not None, this is a gradient function that acts - similarly to the model. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :param device: if specified, the device to create the samples on. - If not specified, use a model parameter's device. - :param progress: if True, show a tqdm progress bar. - :return: a non-differentiable batch of samples. - """ - final = None - for sample in self.p_sample_loop_progressive( - model, - shape, - noise=noise, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - device=device, - progress=progress, - ): - final = sample - return final["sample"] - - def p_sample_loop_progressive( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - ): - """ - Generate samples from the model and yield intermediate samples from - each timestep of diffusion. - - Arguments are the same as p_sample_loop(). - Returns a generator over dicts, where each dict is the return value of - p_sample(). - """ - if device is None: - device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) - if noise is not None: - img = noise - else: - img = th.randn(*shape, device=device) - indices = list(range(self.num_timesteps))[::-1] - - for i in tqdm(indices, disable=not progress): - t = th.tensor([i] * shape[0], device=device) - with th.no_grad(): - out = self.p_sample( - model, - img, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - ) - yield out - img = out["sample"] - - def ddim_sample( - self, - model, - x, - t, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - eta=0.0, - ): - """ - Sample x_{t-1} from the model using DDIM. - - Same usage as p_sample(). - """ - out = self.p_mean_variance( - model, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - model_kwargs=model_kwargs, - ) - if cond_fn is not None: - out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs) - - # Usually our model outputs epsilon, but we re-derive it - # in case we used x_start or x_prev prediction. - eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) - - alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) - alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) - sigma = eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) - # Equation 12. - noise = th.randn_like(x) - mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps - nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0 - sample = mean_pred + nonzero_mask * sigma * noise - return {"sample": sample, "pred_xstart": out["pred_xstart"]} - - def ddim_reverse_sample( - self, - model, - x, - t, - clip_denoised=True, - denoised_fn=None, - model_kwargs=None, - eta=0.0, - ): - """ - Sample x_{t+1} from the model using DDIM reverse ODE. 
- """ - assert eta == 0.0, "Reverse ODE only for deterministic path" - out = self.p_mean_variance( - model, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - model_kwargs=model_kwargs, - ) - # Usually our model outputs epsilon, but we re-derive it - # in case we used x_start or x_prev prediction. - eps = ( - _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - out["pred_xstart"] - ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) - alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) - - # Equation 12. reversed - mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps - - return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} - - def ddim_sample_loop( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - eta=0.0, - ): - """ - Generate samples from the model using DDIM. - - Same usage as p_sample_loop(). - """ - final = None - for sample in self.ddim_sample_loop_progressive( - model, - shape, - noise=noise, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - device=device, - progress=progress, - eta=eta, - ): - final = sample - return final["sample"] - - def ddim_sample_loop_progressive( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - eta=0.0, - ): - """ - Use DDIM to sample from the model and yield intermediate samples from - each timestep of DDIM. - - Same usage as p_sample_loop_progressive(). - """ - if device is None: - device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) - if noise is not None: - img = noise - else: - img = th.randn(*shape, device=device) - indices = list(range(self.num_timesteps))[::-1] - - if progress: - # Lazy import so that we don't depend on tqdm. - from tqdm.auto import tqdm - - indices = tqdm(indices, disable=not progress) - - for i in indices: - t = th.tensor([i] * shape[0], device=device) - with th.no_grad(): - out = self.ddim_sample( - model, - img, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - eta=eta, - ) - yield out - img = out["sample"] - - def _vb_terms_bpd(self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None): - """ - Get a term for the variational lower-bound. - - The resulting units are bits (rather than nats, as one might expect). - This allows for comparison to other papers. - - :return: a dict with the following keys: - - 'output': a shape [N] tensor of NLLs or KLs. - - 'pred_xstart': the x_0 predictions. 
- """ - true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t) - out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs) - kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]) - kl = mean_flat(kl) / np.log(2.0) - - decoder_nll = -discretized_gaussian_log_likelihood( - x_start, means=out["mean"], log_scales=0.5 * out["log_variance"] - ) - assert decoder_nll.shape == x_start.shape - decoder_nll = mean_flat(decoder_nll) / np.log(2.0) - - # At the first timestep return the decoder NLL, - # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) - output = th.where((t == 0), decoder_nll, kl) - return {"output": output, "pred_xstart": out["pred_xstart"]} - - def training_losses(self, model, x_start, t, model_kwargs=None, noise=None): - """ - Compute training losses for a single timestep. - - :param model: the model to evaluate loss on. - :param x_start: the [N x C x ...] tensor of inputs. - :param t: a batch of timestep indices. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :param noise: if specified, the specific Gaussian noise to try to remove. - :return: a dict with the key "loss" containing a tensor of shape [N]. - Some mean or variance settings may also have other keys. - """ - if model_kwargs is None: - model_kwargs = {} - if noise is None: - noise = th.randn_like(x_start) - x_t = self.q_sample(x_start, t, noise=noise) - - terms = {} - - if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: - # TODO: support multiple model outputs for this mode. - terms["loss"] = self._vb_terms_bpd( - model=model, - x_start=x_start, - x_t=x_t, - t=t, - clip_denoised=False, - model_kwargs=model_kwargs, - )["output"] - if self.loss_type == LossType.RESCALED_KL: - terms["loss"] *= self.num_timesteps - elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: - model_outputs = model(x_t, self._scale_timesteps(t), **model_kwargs) - if isinstance(model_outputs, tuple): - model_output = model_outputs[0] - terms["extra_outputs"] = model_outputs[1:] - else: - model_output = model_outputs - - if self.model_var_type in [ - ModelVarType.LEARNED, - ModelVarType.LEARNED_RANGE, - ]: - B, C = x_t.shape[:2] - assert model_output.shape == (B, C * 2, *x_t.shape[2:]) - model_output, model_var_values = th.split(model_output, C, dim=1) - # Learn the variance using the variational bound, but don't let - # it affect our mean prediction. - frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) - terms["vb"] = self._vb_terms_bpd( - model=lambda *args, r=frozen_out: r, - x_start=x_start, - x_t=x_t, - t=t, - clip_denoised=False, - )["output"] - if self.loss_type == LossType.RESCALED_MSE: - # Divide by 1000 for equivalence with initial implementation. - # Without a factor of 1/1000, the VB term hurts the MSE term. - terms["vb"] *= self.num_timesteps / 1000.0 - - if self.model_mean_type == ModelMeanType.PREVIOUS_X: - target = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0] - x_start_pred = torch.zeros(x_start) # Not supported. 
- elif self.model_mean_type == ModelMeanType.START_X: - target = x_start - x_start_pred = model_output - elif self.model_mean_type == ModelMeanType.EPSILON: - target = noise - x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output) - else: - raise NotImplementedError(self.model_mean_type) - assert model_output.shape == target.shape == x_start.shape - terms["mse"] = mean_flat((target - model_output) ** 2) - terms["x_start_predicted"] = x_start_pred - if "vb" in terms: - terms["loss"] = terms["mse"] + terms["vb"] - else: - terms["loss"] = terms["mse"] - else: - raise NotImplementedError(self.loss_type) - - return terms - - def autoregressive_training_losses( - self, - model, - x_start, - t, - model_output_keys, - gd_out_key, - model_kwargs=None, - noise=None, - ): - """ - Compute training losses for a single timestep. - - :param model: the model to evaluate loss on. - :param x_start: the [N x C x ...] tensor of inputs. - :param t: a batch of timestep indices. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :param noise: if specified, the specific Gaussian noise to try to remove. - :return: a dict with the key "loss" containing a tensor of shape [N]. - Some mean or variance settings may also have other keys. - """ - if model_kwargs is None: - model_kwargs = {} - if noise is None: - noise = th.randn_like(x_start) - x_t = self.q_sample(x_start, t, noise=noise) - terms = {} - if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: - assert False # not currently supported for this type of diffusion. - elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: - model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs) - terms.update({k: o for k, o in zip(model_output_keys, model_outputs)}) - model_output = terms[gd_out_key] - if self.model_var_type in [ - ModelVarType.LEARNED, - ModelVarType.LEARNED_RANGE, - ]: - B, C = x_t.shape[:2] - assert model_output.shape == (B, C, 2, *x_t.shape[2:]) - model_output, model_var_values = ( - model_output[:, :, 0], - model_output[:, :, 1], - ) - # Learn the variance using the variational bound, but don't let - # it affect our mean prediction. - frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) - terms["vb"] = self._vb_terms_bpd( - model=lambda *args, r=frozen_out: r, - x_start=x_start, - x_t=x_t, - t=t, - clip_denoised=False, - )["output"] - if self.loss_type == LossType.RESCALED_MSE: - # Divide by 1000 for equivalence with initial implementation. - # Without a factor of 1/1000, the VB term hurts the MSE term. - terms["vb"] *= self.num_timesteps / 1000.0 - - if self.model_mean_type == ModelMeanType.PREVIOUS_X: - target = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0] - x_start_pred = torch.zeros(x_start) # Not supported. 
- elif self.model_mean_type == ModelMeanType.START_X: - target = x_start - x_start_pred = model_output - elif self.model_mean_type == ModelMeanType.EPSILON: - target = noise - x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output) - else: - raise NotImplementedError(self.model_mean_type) - assert model_output.shape == target.shape == x_start.shape - terms["mse"] = mean_flat((target - model_output) ** 2) - terms["x_start_predicted"] = x_start_pred - if "vb" in terms: - terms["loss"] = terms["mse"] + terms["vb"] - else: - terms["loss"] = terms["mse"] - else: - raise NotImplementedError(self.loss_type) - - return terms - - def _prior_bpd(self, x_start): - """ - Get the prior KL term for the variational lower-bound, measured in - bits-per-dim. - - This term can't be optimized, as it only depends on the encoder. - - :param x_start: the [N x C x ...] tensor of inputs. - :return: a batch of [N] KL values (in bits), one per batch element. - """ - batch_size = x_start.shape[0] - t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device) - qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) - kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0) - return mean_flat(kl_prior) / np.log(2.0) - - def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None): - """ - Compute the entire variational lower-bound, measured in bits-per-dim, - as well as other related quantities. - - :param model: the model to evaluate loss on. - :param x_start: the [N x C x ...] tensor of inputs. - :param clip_denoised: if True, clip denoised samples. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - - :return: a dict containing the following keys: - - total_bpd: the total variational lower-bound, per batch element. - - prior_bpd: the prior term in the lower-bound. - - vb: an [N x T] tensor of terms in the lower-bound. - - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. - - mse: an [N x T] tensor of epsilon MSEs for each timestep. - """ - device = x_start.device - batch_size = x_start.shape[0] - - vb = [] - xstart_mse = [] - mse = [] - for t in list(range(self.num_timesteps))[::-1]: - t_batch = th.tensor([t] * batch_size, device=device) - noise = th.randn_like(x_start) - x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) - # Calculate VLB term at the current timestep - with th.no_grad(): - out = self._vb_terms_bpd( - model, - x_start=x_start, - x_t=x_t, - t=t_batch, - clip_denoised=clip_denoised, - model_kwargs=model_kwargs, - ) - vb.append(out["output"]) - xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2)) - eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"]) - mse.append(mean_flat((eps - noise) ** 2)) - - vb = th.stack(vb, dim=1) - xstart_mse = th.stack(xstart_mse, dim=1) - mse = th.stack(mse, dim=1) - - prior_bpd = self._prior_bpd(x_start) - total_bpd = vb.sum(dim=1) + prior_bpd - return { - "total_bpd": total_bpd, - "prior_bpd": prior_bpd, - "vb": vb, - "xstart_mse": xstart_mse, - "mse": mse, - } - - -class SpacedDiffusion(GaussianDiffusion): - """ - A diffusion process which can skip steps in a base diffusion process. - - :param use_timesteps: a collection (sequence or set) of timesteps from the - original diffusion process to retain. - :param kwargs: the kwargs to create the base diffusion process. 
- """ - - def __init__(self, use_timesteps, **kwargs): - self.use_timesteps = set(use_timesteps) - self.timestep_map = [] - self.original_num_steps = len(kwargs["betas"]) - - base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa - last_alpha_cumprod = 1.0 - new_betas = [] - for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): - if i in self.use_timesteps: - new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) - last_alpha_cumprod = alpha_cumprod - self.timestep_map.append(i) - kwargs["betas"] = np.array(new_betas) - super().__init__(**kwargs) - - def p_mean_variance(self, model, *args, **kwargs): # pylint: disable=signature-differs - return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) - - def training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs - return super().training_losses(self._wrap_model(model), *args, **kwargs) - - def autoregressive_training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs - return super().autoregressive_training_losses(self._wrap_model(model, True), *args, **kwargs) - - def condition_mean(self, cond_fn, *args, **kwargs): - return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) - - def condition_score(self, cond_fn, *args, **kwargs): - return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) - - def _wrap_model(self, model, autoregressive=False): - if isinstance(model, _WrappedModel) or isinstance(model, _WrappedAutoregressiveModel): - return model - mod = _WrappedAutoregressiveModel if autoregressive else _WrappedModel - return mod(model, self.timestep_map, self.rescale_timesteps, self.original_num_steps) - - def _scale_timesteps(self, t): - # Scaling is done by the wrapped model. - return t - - -def space_timesteps(num_timesteps, section_counts): - """ - Create a list of timesteps to use from an original diffusion process, - given the number of timesteps we want to take from equally-sized portions - of the original process. - - For example, if there's 300 timesteps and the section counts are [10,15,20] - then the first 100 timesteps are strided to be 10 timesteps, the second 100 - are strided to be 15 timesteps, and the final 100 are strided to be 20. - - If the stride is a string starting with "ddim", then the fixed striding - from the DDIM paper is used, and only one section is allowed. - - :param num_timesteps: the number of diffusion steps in the original - process to divide up. - :param section_counts: either a list of numbers, or a string containing - comma-separated numbers, indicating the step count - per section. As a special case, use "ddimN" where N - is a number of steps to use the striding from the - DDIM paper. - :return: a set of diffusion steps from the original process to use. 
- """ - if isinstance(section_counts, str): - if section_counts.startswith("ddim"): - desired_count = int(section_counts[len("ddim") :]) - for i in range(1, num_timesteps): - if len(range(0, num_timesteps, i)) == desired_count: - return set(range(0, num_timesteps, i)) - raise ValueError(f"cannot create exactly {num_timesteps} steps with an integer stride") - section_counts = [int(x) for x in section_counts.split(",")] - size_per = num_timesteps // len(section_counts) - extra = num_timesteps % len(section_counts) - start_idx = 0 - all_steps = [] - for i, section_count in enumerate(section_counts): - size = size_per + (1 if i < extra else 0) - if size < section_count: - raise ValueError(f"cannot divide section of {size} steps into {section_count}") - if section_count <= 1: - frac_stride = 1 - else: - frac_stride = (size - 1) / (section_count - 1) - cur_idx = 0.0 - taken_steps = [] - for _ in range(section_count): - taken_steps.append(start_idx + round(cur_idx)) - cur_idx += frac_stride - all_steps += taken_steps - start_idx += size - return set(all_steps) - - -class _WrappedModel: - def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): - self.model = model - self.timestep_map = timestep_map - self.rescale_timesteps = rescale_timesteps - self.original_num_steps = original_num_steps - - def __call__(self, x, ts, **kwargs): - map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) - new_ts = map_tensor[ts] - if self.rescale_timesteps: - new_ts = new_ts.float() * (1000.0 / self.original_num_steps) - return self.model(x, new_ts, **kwargs) - - -class _WrappedAutoregressiveModel: - def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): - self.model = model - self.timestep_map = timestep_map - self.rescale_timesteps = rescale_timesteps - self.original_num_steps = original_num_steps - - def __call__(self, x, x0, ts, **kwargs): - map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) - new_ts = map_tensor[ts] - if self.rescale_timesteps: - new_ts = new_ts.float() * (1000.0 / self.original_num_steps) - return self.model(x, x0, new_ts, **kwargs) - - -def _extract_into_tensor(arr, timesteps, broadcast_shape): - """ - Extract values from a 1-D numpy array for a batch of indices. - - :param arr: the 1-D numpy array. - :param timesteps: a tensor of indices into the array to extract. - :param broadcast_shape: a larger shape of K dimensions with the batch - dimension equal to the length of timesteps. - :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. 
- """ - res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() - while len(res.shape) < len(broadcast_shape): - res = res[..., None] - return res.expand(broadcast_shape) diff --git a/TTS/tts/layers/xtts/vocoder.py b/TTS/tts/layers/xtts/vocoder.py deleted file mode 100644 index 0f4991b886..0000000000 --- a/TTS/tts/layers/xtts/vocoder.py +++ /dev/null @@ -1,385 +0,0 @@ -import json -from dataclasses import dataclass -from enum import Enum -from typing import Callable, Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F - -MAX_WAV_VALUE = 32768.0 - - -class KernelPredictor(torch.nn.Module): - """Kernel predictor for the location-variable convolutions""" - - def __init__( - self, - cond_channels, - conv_in_channels, - conv_out_channels, - conv_layers, - conv_kernel_size=3, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - kpnet_nonlinear_activation="LeakyReLU", - kpnet_nonlinear_activation_params={"negative_slope": 0.1}, - ): - """ - Args: - cond_channels (int): number of channel for the conditioning sequence, - conv_in_channels (int): number of channel for the input sequence, - conv_out_channels (int): number of channel for the output sequence, - conv_layers (int): number of layers - """ - super().__init__() - - self.conv_in_channels = conv_in_channels - self.conv_out_channels = conv_out_channels - self.conv_kernel_size = conv_kernel_size - self.conv_layers = conv_layers - - kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w - kpnet_bias_channels = conv_out_channels * conv_layers # l_b - - self.input_conv = nn.Sequential( - nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - - self.residual_convs = nn.ModuleList() - padding = (kpnet_conv_size - 1) // 2 - for _ in range(3): - self.residual_convs.append( - nn.Sequential( - nn.Dropout(kpnet_dropout), - nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - ) - self.kernel_conv = nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_kernel_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - self.bias_conv = nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_bias_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - - def forward(self, c): - """ - Args: - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - """ - batch, _, cond_length = c.shape - c = self.input_conv(c) - for residual_conv in self.residual_convs: - residual_conv.to(c.device) - c = c + residual_conv(c) - k = self.kernel_conv(c) - b = self.bias_conv(c) - kernels = k.contiguous().view( - batch, - self.conv_layers, - self.conv_in_channels, - self.conv_out_channels, - self.conv_kernel_size, - cond_length, - ) - bias = b.contiguous().view( - batch, - self.conv_layers, - self.conv_out_channels, - cond_length, - ) - - return kernels, bias - - def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.input_conv[0]) - 
nn.utils.remove_weight_norm(self.kernel_conv) - nn.utils.remove_weight_norm(self.bias_conv) - for block in self.residual_convs: - nn.utils.remove_weight_norm(block[1]) - nn.utils.remove_weight_norm(block[3]) - - -class LVCBlock(torch.nn.Module): - """the location-variable convolutions""" - - def __init__( - self, - in_channels, - cond_channels, - stride, - dilations=[1, 3, 9, 27], - lReLU_slope=0.2, - conv_kernel_size=3, - cond_hop_length=256, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - ): - super().__init__() - - self.cond_hop_length = cond_hop_length - self.conv_layers = len(dilations) - self.conv_kernel_size = conv_kernel_size - - self.kernel_predictor = KernelPredictor( - cond_channels=cond_channels, - conv_in_channels=in_channels, - conv_out_channels=2 * in_channels, - conv_layers=len(dilations), - conv_kernel_size=conv_kernel_size, - kpnet_hidden_channels=kpnet_hidden_channels, - kpnet_conv_size=kpnet_conv_size, - kpnet_dropout=kpnet_dropout, - kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope}, - ) - - self.convt_pre = nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm( - nn.ConvTranspose1d( - in_channels, - in_channels, - 2 * stride, - stride=stride, - padding=stride // 2 + stride % 2, - output_padding=stride % 2, - ) - ), - ) - - self.conv_blocks = nn.ModuleList() - for dilation in dilations: - self.conv_blocks.append( - nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm( - nn.Conv1d( - in_channels, - in_channels, - conv_kernel_size, - padding=dilation * (conv_kernel_size - 1) // 2, - dilation=dilation, - ) - ), - nn.LeakyReLU(lReLU_slope), - ) - ) - - def forward(self, x, c): - """forward propagation of the location-variable convolutions. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length) - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - - Returns: - Tensor: the output sequence (batch, in_channels, in_length) - """ - _, in_channels, _ = x.shape # (B, c_g, L') - - x = self.convt_pre(x) # (B, c_g, stride * L') - kernels, bias = self.kernel_predictor(c) - - for i, conv in enumerate(self.conv_blocks): - output = conv(x) # (B, c_g, stride * L') - - k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) - b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) - - output = self.location_variable_convolution( - output, k, b, hop_size=self.cond_hop_length - ) # (B, 2 * c_g, stride * L'): LVC - x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( - output[:, in_channels:, :] - ) # (B, c_g, stride * L'): GAU - - return x - - def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): - """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. - Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length). - kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) - bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) - dilation (int): the dilation of convolution. - hop_size (int): the hop_size of the conditioning sequence. - Returns: - (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
- """ - batch, _, in_length = x.shape - batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" - - padding = dilation * int((kernel_size - 1) / 2) - x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) - x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) - - if hop_size < dilation: - x = F.pad(x, (0, dilation), "constant", 0) - x = x.unfold( - 3, dilation, dilation - ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) - x = x[:, :, :, :, :hop_size] - x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) - x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) - - o = torch.einsum("bildsk,biokl->bolsd", x, kernel) - o = o.to(memory_format=torch.channels_last_3d) - bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) - o = o + bias - o = o.contiguous().view(batch, out_channels, -1) - - return o - - def remove_weight_norm(self): - self.kernel_predictor.remove_weight_norm() - nn.utils.remove_weight_norm(self.convt_pre[1]) - for block in self.conv_blocks: - nn.utils.remove_weight_norm(block[1]) - - -class UnivNetGenerator(nn.Module): - """ - UnivNet Generator - - Originally from https://github.com/mindslab-ai/univnet/blob/master/model/generator.py. - """ - - def __init__( - self, - noise_dim=64, - channel_size=32, - dilations=[1, 3, 9, 27], - strides=[8, 8, 4], - lReLU_slope=0.2, - kpnet_conv_size=3, - # Below are MEL configurations options that this generator requires. - hop_length=256, - n_mel_channels=100, - ): - super(UnivNetGenerator, self).__init__() - self.mel_channel = n_mel_channels - self.noise_dim = noise_dim - self.hop_length = hop_length - channel_size = channel_size - kpnet_conv_size = kpnet_conv_size - - self.res_stack = nn.ModuleList() - hop_length = 1 - for stride in strides: - hop_length = stride * hop_length - self.res_stack.append( - LVCBlock( - channel_size, - n_mel_channels, - stride=stride, - dilations=dilations, - lReLU_slope=lReLU_slope, - cond_hop_length=hop_length, - kpnet_conv_size=kpnet_conv_size, - ) - ) - - self.conv_pre = nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect")) - - self.conv_post = nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")), - nn.Tanh(), - ) - - def forward(self, c, z): - """ - Args: - c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length) - z (Tensor): the noise sequence (batch, noise_dim, in_length) - - """ - z = self.conv_pre(z) # (B, c_g, L) - - for res_block in self.res_stack: - res_block.to(z.device) - z = res_block(z, c) # (B, c_g, L * s_0 * ... 
* s_i) - - z = self.conv_post(z) # (B, 1, L * 256) - - return z - - def eval(self, inference=False): - super(UnivNetGenerator, self).eval() - # don't remove weight norm while validation in training loop - if inference: - self.remove_weight_norm() - - def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.conv_pre) - - for layer in self.conv_post: - if len(layer.state_dict()) != 0: - nn.utils.remove_weight_norm(layer) - - for res_block in self.res_stack: - res_block.remove_weight_norm() - - def inference(self, c, z=None): - # pad input mel with zeros to cut artifact - # see https://github.com/seungwonpark/melgan/issues/8 - zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device) - mel = torch.cat((c, zero), dim=2) - - if z is None: - z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device) - - audio = self.forward(mel, z) - audio = audio[:, :, : -(self.hop_length * 10)] - audio = audio.clamp(min=-1, max=1) - return audio - - -if __name__ == "__main__": - model = UnivNetGenerator() - - c = torch.randn(3, 100, 10) - z = torch.randn(3, 64, 10) - print(c.shape) - - y = model(c, z) - print(y.shape) - assert y.shape == torch.Size([3, 1, 2560]) - - pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(pytorch_total_params) diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 4aaf526111..f38dace235 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -252,7 +252,12 @@ def compute_gst(self, inputs, style_input, speaker_embedding=None): def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None): """Capacitron Variational Autoencoder""" - (VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer( + ( + VAE_outputs, + posterior_distribution, + prior_distribution, + capacitron_beta, + ) = self.capacitron_vae_layer( reference_mel_info, text_info, speaker_embedding, # pylint: disable=not-callable diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index c8cfcfdd04..16644ff95e 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -676,7 +676,12 @@ def inference( ), "Too much text provided. Break the text up into separate segments and re-try inference." 
if voice_samples is not None: - (auto_conditioning, diffusion_conditioning, _, _,) = self.get_conditioning_latents( + ( + auto_conditioning, + diffusion_conditioning, + _, + _, + ) = self.get_conditioning_latents( voice_samples, return_mels=True, latent_averaging_mode=latent_averaging_mode, diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 477f31bfc9..af94675be9 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -9,13 +9,10 @@ from coqpit import Coqpit from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, wav_to_univnet_mel -from TTS.tts.layers.tortoise.diffusion_decoder import DiffusionTts -from TTS.tts.layers.xtts.diffusion import SpacedDiffusion, get_named_beta_schedule, space_timesteps from TTS.tts.layers.xtts.gpt import GPT from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder from TTS.tts.layers.xtts.stream_generator import init_stream_support from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer -from TTS.tts.layers.xtts.vocoder import UnivNetGenerator from TTS.tts.models.base_tts import BaseTTS from TTS.utils.io import load_fsspec @@ -168,12 +165,10 @@ class XttsAudioConfig(Coqpit): Args: sample_rate (int): The sample rate in which the GPT operates. - diffusion_sample_rate (int): The sample rate of the diffusion audio waveform. output_sample_rate (int): The sample rate of the output audio waveform. """ sample_rate: int = 22050 - diffusion_sample_rate: int = 24000 output_sample_rate: int = 24000 @@ -189,7 +184,6 @@ class XttsArgs(Coqpit): clvp_checkpoint (str, optional): The checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None. decoder_checkpoint (str, optional): The checkpoint for the DiffTTS model. Defaults to None. num_chars (int, optional): The maximum number of characters to generate. Defaults to 255. - use_hifigan (bool, optional): Whether to use hifigan with implicit enhancement or diffusion + univnet as a decoder. Defaults to True. For GPT model: gpt_max_audio_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604. 
@@ -227,7 +221,6 @@ class XttsArgs(Coqpit): clvp_checkpoint: str = None decoder_checkpoint: str = None num_chars: int = 255 - use_hifigan: bool = True # XTTS GPT Encoder params tokenizer_file: str = "" @@ -324,32 +317,15 @@ def init_models(self): code_stride_len=self.args.gpt_code_stride_len, ) - if self.args.use_hifigan: - self.hifigan_decoder = HifiDecoder( - input_sample_rate=self.args.input_sample_rate, - output_sample_rate=self.args.output_sample_rate, - output_hop_length=self.args.output_hop_length, - ar_mel_length_compression=self.args.gpt_code_stride_len, - decoder_input_dim=self.args.decoder_input_dim, - d_vector_dim=self.args.d_vector_dim, - cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, - ) - - if not self.args.use_hifigan: - self.diffusion_decoder = DiffusionTts( - model_channels=self.args.diff_model_channels, - num_layers=self.args.diff_num_layers, - in_channels=self.args.diff_in_channels, - out_channels=self.args.diff_out_channels, - in_latent_channels=self.args.diff_in_latent_channels, - in_tokens=self.args.diff_in_tokens, - dropout=self.args.diff_dropout, - use_fp16=self.args.diff_use_fp16, - num_heads=self.args.diff_num_heads, - layer_drop=self.args.diff_layer_drop, - unconditioned_percentage=self.args.diff_unconditioned_percentage, - ) - self.vocoder = UnivNetGenerator() + self.hifigan_decoder = HifiDecoder( + input_sample_rate=self.args.input_sample_rate, + output_sample_rate=self.args.output_sample_rate, + output_hop_length=self.args.output_hop_length, + ar_mel_length_compression=self.args.gpt_code_stride_len, + decoder_input_dim=self.args.decoder_input_dim, + d_vector_dim=self.args.d_vector_dim, + cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, + ) @property def device(self): @@ -430,7 +406,6 @@ def get_conditioning_latents( sound_norm_refs=False, ): speaker_embedding = None - diffusion_cond_latents = None audio, sr = torchaudio.load(audio_path) audio = audio[:, : sr * max_ref_length].to(self.device) @@ -441,12 +416,9 @@ def get_conditioning_latents( if librosa_trim_db is not None: audio = librosa.effects.trim(audio, top_db=librosa_trim_db)[0] - if self.args.use_hifigan or self.args.use_hifigan: - speaker_embedding = self.get_speaker_embedding(audio, sr) - else: - diffusion_cond_latents = self.get_diffusion_cond_latents(audio, sr) + speaker_embedding = self.get_speaker_embedding(audio, sr) gpt_cond_latents = self.get_gpt_cond_latents(audio, sr, length=gpt_cond_len) # [1, 1024, T] - return gpt_cond_latents, diffusion_cond_latents, speaker_embedding + return gpt_cond_latents, speaker_embedding def synthesize(self, text, config, speaker_wav, language, **kwargs): """Synthesize speech with the given input text. @@ -579,7 +551,7 @@ def full_inference( Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length. Sample rate is 24kHz. 
""" - (gpt_cond_latent, diffusion_conditioning, speaker_embedding) = self.get_conditioning_latents( + (gpt_cond_latent, speaker_embedding) = self.get_conditioning_latents( audio_path=ref_audio_path, gpt_cond_len=gpt_cond_len, max_ref_length=max_ref_len, @@ -591,7 +563,6 @@ def full_inference( language, gpt_cond_latent, speaker_embedding, - diffusion_conditioning, temperature=temperature, length_penalty=length_penalty, repetition_penalty=repetition_penalty, @@ -614,7 +585,6 @@ def inference( language, gpt_cond_latent, speaker_embedding, - diffusion_conditioning, # GPT inference temperature=0.65, length_penalty=1, @@ -643,14 +613,6 @@ def inference( text_tokens.shape[-1] < self.args.gpt_max_text_tokens ), " ❗ XTTS can only generate text with a maximum of 400 tokens." - if not self.args.use_hifigan: - diffuser = load_discrete_vocoder_diffuser( - desired_diffusion_steps=decoder_iterations, - cond_free=cond_free, - cond_free_k=cond_free_k, - sampler=decoder_sampler, - ) - with torch.no_grad(): gpt_codes = self.gpt.generate( cond_latents=gpt_cond_latent, @@ -692,29 +654,12 @@ def inference( gpt_latents = gpt_latents[:, :k] break - if decoder == "hifigan": - assert ( - hasattr(self, "hifigan_decoder") and self.hifigan_decoder is not None - ), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`" - wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding) - else: - assert hasattr( - self, "diffusion_decoder" - ), "You must disable hifigan decoders to use difffusion by setting `use_hifigan: false`" - mel = do_spectrogram_diffusion( - self.diffusion_decoder, - diffuser, - gpt_latents, - diffusion_conditioning, - temperature=diffusion_temperature, - ) - wav = self.vocoder.inference(mel) + wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding) return { "wav": wav.cpu().numpy().squeeze(), "gpt_latents": gpt_latents, "speaker_embedding": speaker_embedding, - "diffusion_conditioning": diffusion_conditioning, } def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len): @@ -752,9 +697,6 @@ def inference_stream( decoder="hifigan", **hf_generate_kwargs, ): - assert hasattr( - self, "hifigan_decoder" - ), "`inference_stream` requires use_hifigan to be set to true in the config.model_args, diffusion is too slow to stream." 
text = text.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device) @@ -793,13 +735,7 @@ def inference_stream( if is_end or (stream_chunk_size > 0 and len(last_tokens) >= stream_chunk_size): gpt_latents = torch.cat(all_latents, dim=0)[None, :] - if decoder == "hifigan": - assert hasattr( - self, "hifigan_decoder" - ), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`" - wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device)) - else: - raise NotImplementedError("Diffusion for streaming inference not implemented.") + wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device)) wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks( wav_gen.squeeze(), wav_gen_prev, wav_overlap, overlap_wav_len ) @@ -827,10 +763,8 @@ def eval(self): # pylint: disable=redefined-builtin def get_compatible_checkpoint_state_dict(self, model_path): checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"] - ignore_keys = ["diffusion_decoder", "vocoder"] if self.args.use_hifigan else [] - ignore_keys += [] if self.args.use_hifigan else ["hifigan_decoder"] # remove xtts gpt trainer extra keys - ignore_keys += ["torch_mel_spectrogram_style_encoder", "torch_mel_spectrogram_dvae", "dvae"] + ignore_keys = ["torch_mel_spectrogram_style_encoder", "torch_mel_spectrogram_dvae", "dvae"] for key in list(checkpoint.keys()): # check if it is from the coqui Trainer if so convert it if key.startswith("xtts."): @@ -889,12 +823,7 @@ def load_checkpoint( self.load_state_dict(checkpoint, strict=strict) if eval: - if hasattr(self, "hifigan_decoder"): - self.hifigan_decoder.eval() - if hasattr(self, "diffusion_decoder"): - self.diffusion_decoder.eval() - if hasattr(self, "vocoder"): - self.vocoder.eval() + self.hifigan_decoder.eval() self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=use_deepspeed) self.gpt.eval() diff --git a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py index 9134be0db2..268a033535 100644 --- a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py @@ -94,12 +94,9 @@ def main(): gpt_num_audio_tokens=8194, gpt_start_audio_token=8192, gpt_stop_audio_token=8193, - use_ne_hifigan=True, # if it is true it will keep the non-enhanced keys on the output checkpoint ) # define audio config - audio_config = XttsAudioConfig( - sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000 - ) + audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) # training parameters config config = GPTTrainerConfig( output_path=OUT_PATH, diff --git a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py index ee6b22becd..d94204ca4d 100644 --- a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py @@ -93,14 +93,11 @@ def main(): gpt_num_audio_tokens=8194, gpt_start_audio_token=8192, gpt_stop_audio_token=8193, - use_ne_hifigan=True, # if it is true it will keep the non-enhanced keys on the output checkpoint gpt_use_masking_gt_prompt_approach=True, gpt_use_perceiver_resampler=True, ) # define audio config - audio_config = XttsAudioConfig( - sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000 - ) + audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, 
output_sample_rate=24000) # training parameters config config = GPTTrainerConfig( output_path=OUT_PATH, diff --git a/tests/xtts_tests/test_xtts_gpt_train.py b/tests/xtts_tests/test_xtts_gpt_train.py index 47b1dd7d27..03514daa3b 100644 --- a/tests/xtts_tests/test_xtts_gpt_train.py +++ b/tests/xtts_tests/test_xtts_gpt_train.py @@ -86,11 +86,8 @@ gpt_num_audio_tokens=8194, gpt_start_audio_token=8192, gpt_stop_audio_token=8193, - use_ne_hifigan=True, -) -audio_config = XttsAudioConfig( - sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000 ) +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) config = GPTTrainerConfig( epochs=1, output_path=OUT_PATH, diff --git a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py index 6b6f1330dc..8099503855 100644 --- a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py +++ b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py @@ -86,11 +86,8 @@ gpt_stop_audio_token=8193, gpt_use_masking_gt_prompt_approach=True, gpt_use_perceiver_resampler=True, - use_ne_hifigan=True, -) -audio_config = XttsAudioConfig( - sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000 ) +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) config = GPTTrainerConfig( epochs=1, output_path=OUT_PATH,
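# --- Reviewer note (not part of the patch) --------------------------------
# A minimal sketch of the XTTS inference path after this change, for context.
# `model` and "ref.wav" are placeholders, the numeric arguments are example
# values, and any parameter not visible in the hunks above is assumed from
# the surrounding API. get_conditioning_latents() now returns only
# (gpt_cond_latents, speaker_embedding), and decoding always goes through
# HifiDecoder -- the diffusion_decoder / UnivNetGenerator branch is gone.

gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path="ref.wav",   # placeholder reference clip
    gpt_cond_len=6,         # example value
    max_ref_length=10,      # example value
)
out = model.inference(
    "Hello world.",         # text to synthesize
    "en",                   # language
    gpt_cond_latent,
    speaker_embedding,      # no diffusion_conditioning argument anymore
)
wav = out["wav"]            # 24 kHz waveform decoded by the HiFi-GAN decoder

# The matching audio config likewise drops diffusion_sample_rate:
audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)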