diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py index cb540577f8..2166eebb3c 100644 --- a/TTS/tts/layers/tortoise/dpm_solver.py +++ b/TTS/tts/layers/tortoise/dpm_solver.py @@ -1548,4 +1548,4 @@ def expand_dims(v, dims): Returns: a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`. """ - return v[(...,) + (None,) * (dims - 1)] + return v[(...,) + (None,) * (dims - 1)] \ No newline at end of file diff --git a/TTS/tts/layers/xtts/diffusion.py b/TTS/tts/layers/xtts/diffusion.py deleted file mode 100644 index 37665bc676..0000000000 --- a/TTS/tts/layers/xtts/diffusion.py +++ /dev/null @@ -1,1319 +0,0 @@ -import enum -import math - -import numpy as np -import torch -import torch as th -from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral -from tqdm import tqdm - -from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper - -K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} -SAMPLERS = ["dpm++2m", "p", "ddim"] - - -def normal_kl(mean1, logvar1, mean2, logvar2): - """ - Compute the KL divergence between two gaussians. - - Shapes are automatically broadcasted, so batches can be compared to - scalars, among other use cases. - """ - tensor = None - for obj in (mean1, logvar1, mean2, logvar2): - if isinstance(obj, th.Tensor): - tensor = obj - break - assert tensor is not None, "at least one argument must be a Tensor" - - # Force variances to be Tensors. Broadcasting helps convert scalars to - # Tensors, but it does not work for th.exp(). - logvar1, logvar2 = [x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) for x in (logvar1, logvar2)] - - return 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2)) - - -def approx_standard_normal_cdf(x): - """ - A fast approximation of the cumulative distribution function of the - standard normal. - """ - return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) - - -def discretized_gaussian_log_likelihood(x, *, means, log_scales): - """ - Compute the log-likelihood of a Gaussian distribution discretizing to a - given image. - - :param x: the target images. It is assumed that this was uint8 values, - rescaled to the range [-1, 1]. - :param means: the Gaussian mean Tensor. - :param log_scales: the Gaussian log stddev Tensor. - :return: a tensor like x of log probabilities (in nats). - """ - assert x.shape == means.shape == log_scales.shape - centered_x = x - means - inv_stdv = th.exp(-log_scales) - plus_in = inv_stdv * (centered_x + 1.0 / 255.0) - cdf_plus = approx_standard_normal_cdf(plus_in) - min_in = inv_stdv * (centered_x - 1.0 / 255.0) - cdf_min = approx_standard_normal_cdf(min_in) - log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) - log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) - cdf_delta = cdf_plus - cdf_min - log_probs = th.where( - x < -0.999, - log_cdf_plus, - th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), - ) - assert log_probs.shape == x.shape - return log_probs - - -def mean_flat(tensor): - """ - Take the mean over all non-batch dimensions. - """ - return tensor.mean(dim=list(range(1, len(tensor.shape)))) - - -def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): - """ - Get a pre-defined beta schedule for the given name. - - The beta schedule library consists of beta schedules which remain similar - in the limit of num_diffusion_timesteps. 
- Beta schedules may be added, but should not be removed or changed once - they are committed to maintain backwards compatibility. - """ - if schedule_name == "linear": - # Linear schedule from Ho et al, extended to work for any number of - # diffusion steps. - scale = 1000 / num_diffusion_timesteps - beta_start = scale * 0.0001 - beta_end = scale * 0.02 - return np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64) - elif schedule_name == "cosine": - return betas_for_alpha_bar( - num_diffusion_timesteps, - lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, - ) - else: - raise NotImplementedError(f"unknown beta schedule: {schedule_name}") - - -def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, - which defines the cumulative product of (1-beta) over time from t = [0,1]. - - :param num_diffusion_timesteps: the number of betas to produce. - :param alpha_bar: a lambda that takes an argument t from 0 to 1 and - produces the cumulative product of (1-beta) up to that - part of the diffusion process. - :param max_beta: the maximum beta to use; use values lower than 1 to - prevent singularities. - """ - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return np.array(betas) - - -class ModelMeanType(enum.Enum): - """ - Which type of output the model predicts. - """ - - PREVIOUS_X = "previous_x" # the model predicts x_{t-1} - START_X = "start_x" # the model predicts x_0 - EPSILON = "epsilon" # the model predicts epsilon - - -class ModelVarType(enum.Enum): - """ - What is used as the model's output variance. - - The LEARNED_RANGE option has been added to allow the model to predict - values between FIXED_SMALL and FIXED_LARGE, making its job easier. - """ - - LEARNED = "learned" - FIXED_SMALL = "fixed_small" - FIXED_LARGE = "fixed_large" - LEARNED_RANGE = "learned_range" - - -class LossType(enum.Enum): - MSE = "mse" # use raw MSE loss (and KL when learning variances) - RESCALED_MSE = "rescaled_mse" # use raw MSE loss (with RESCALED_KL when learning variances) - KL = "kl" # use the variational lower-bound - RESCALED_KL = "rescaled_kl" # like KL, but rescale to estimate the full VLB - - def is_vb(self): - return self == LossType.KL or self == LossType.RESCALED_KL - - -class GaussianDiffusion: - """ - Utilities for training and sampling diffusion models. - - Ported directly from here, and then adapted over time to further experimentation. - https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 - - :param betas: a 1-D numpy array of betas for each diffusion timestep, - starting at T and going to 1. - :param model_mean_type: a ModelMeanType determining what the model outputs. - :param model_var_type: a ModelVarType determining how variance is output. - :param loss_type: a LossType determining the loss function to use. - :param rescale_timesteps: if True, pass floating point timesteps into the - model so that they are always scaled like in the - original paper (0 to 1000). 
- """ - - def __init__( - self, - *, - betas, - model_mean_type, - model_var_type, - loss_type, - rescale_timesteps=False, # this is generally False - conditioning_free=False, - conditioning_free_k=1, - ramp_conditioning_free=True, - sampler="ddim", - ): - self.sampler = sampler - self.model_mean_type = ModelMeanType(model_mean_type) - self.model_var_type = ModelVarType(model_var_type) - self.loss_type = LossType(loss_type) - self.rescale_timesteps = rescale_timesteps - self.conditioning_free = conditioning_free - self.conditioning_free_k = conditioning_free_k - self.ramp_conditioning_free = ramp_conditioning_free - - # Use float64 for accuracy. - betas = np.array(betas, dtype=np.float64) - self.betas = betas - assert len(betas.shape) == 1, "betas must be 1-D" - assert (betas > 0).all() and (betas <= 1).all() - - self.num_timesteps = int(betas.shape[0]) - - alphas = 1.0 - betas - self.alphas_cumprod = np.cumprod(alphas, axis=0) - self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) - self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) - assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) - self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) - self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) - self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) - self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) - - # calculations for posterior q(x_{t-1} | x_t, x_0) - self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) - # log calculation clipped because the posterior variance is 0 at the - # beginning of the diffusion chain. - self.posterior_log_variance_clipped = np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:])) - self.posterior_mean_coef1 = betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) - self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod) - - def q_mean_variance(self, x_start, t): - """ - Get the distribution q(x_t | x_0). - - :param x_start: the [N x C x ...] tensor of noiseless inputs. - :param t: the number of diffusion steps (minus 1). Here, 0 means one step. - :return: A tuple (mean, variance, log_variance), all of x_start's shape. - """ - mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start - variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) - log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape) - return mean, variance, log_variance - - def q_sample(self, x_start, t, noise=None): - """ - Diffuse the data for a given number of diffusion steps. - - In other words, sample from q(x_t | x_0). - - :param x_start: the initial data batch. - :param t: the number of diffusion steps (minus 1). Here, 0 means one step. - :param noise: if specified, the split-out normal noise. - :return: A noisy version of x_start. 
- """ - if noise is None: - noise = th.randn_like(x_start) - assert noise.shape == x_start.shape - return ( - _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start - + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise - ) - - def q_posterior_mean_variance(self, x_start, x_t, t): - """ - Compute the mean and variance of the diffusion posterior: - - q(x_{t-1} | x_t, x_0) - - """ - assert x_start.shape == x_t.shape - posterior_mean = ( - _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start - + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t - ) - posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) - posterior_log_variance_clipped = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape) - assert ( - posterior_mean.shape[0] - == posterior_variance.shape[0] - == posterior_log_variance_clipped.shape[0] - == x_start.shape[0] - ) - return posterior_mean, posterior_variance, posterior_log_variance_clipped - - def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None): - """ - Apply the model to get p(x_{t-1} | x_t), as well as a prediction of - the initial x, x_0. - - :param model: the model, which takes a signal and a batch of timesteps - as input. - :param x: the [N x C x ...] tensor at time t. - :param t: a 1-D Tensor of timesteps. - :param clip_denoised: if True, clip the denoised signal into [-1, 1]. - :param denoised_fn: if not None, a function which applies to the - x_start prediction before it is used to sample. Applies before - clip_denoised. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :return: a dict with the following keys: - - 'mean': the model mean output. - - 'variance': the model variance output. - - 'log_variance': the log of 'variance'. - - 'pred_xstart': the prediction for x_0. - """ - if model_kwargs is None: - model_kwargs = {} - - assert self.model_var_type == ModelVarType.LEARNED_RANGE - assert self.model_mean_type == ModelMeanType.EPSILON - assert denoised_fn is None - assert clip_denoised is True - B, C = x.shape[:2] - assert t.shape == (B,) - model_output = model(x, self._scale_timesteps(t), **model_kwargs) - if self.conditioning_free: - model_output_no_conditioning = model(x, self._scale_timesteps(t), conditioning_free=True, **model_kwargs) - - if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: - assert model_output.shape == (B, C * 2, *x.shape[2:]) - model_output, model_var_values = th.split(model_output, C, dim=1) - if self.conditioning_free: - model_output_no_conditioning, _ = th.split(model_output_no_conditioning, C, dim=1) - if self.model_var_type == ModelVarType.LEARNED: - assert False - model_log_variance = model_var_values - model_variance = th.exp(model_log_variance) - else: - min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape) - max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) - # The model_var_values is [-1, 1] for [min_var, max_var]. - frac = (model_var_values + 1) / 2 - model_log_variance = frac * max_log + (1 - frac) * min_log - model_variance = th.exp(model_log_variance) - else: - assert False - model_variance, model_log_variance = { - # for fixedlarge, we set the initial (log-)variance like so - # to get a better decoder log likelihood. 
- ModelVarType.FIXED_LARGE: ( - np.append(self.posterior_variance[1], self.betas[1:]), - np.log(np.append(self.posterior_variance[1], self.betas[1:])), - ), - ModelVarType.FIXED_SMALL: ( - self.posterior_variance, - self.posterior_log_variance_clipped, - ), - }[self.model_var_type] - model_variance = _extract_into_tensor(model_variance, t, x.shape) - model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) - - if self.conditioning_free: - if self.ramp_conditioning_free: - assert t.shape[0] == 1 # This should only be used in inference. - cfk = self.conditioning_free_k * (1 - self._scale_timesteps(t)[0].item() / self.num_timesteps) - else: - cfk = self.conditioning_free_k - model_output = (1 + cfk) * model_output - cfk * model_output_no_conditioning - - def process_xstart(x): - if denoised_fn is not None: - assert False - x = denoised_fn(x) - if clip_denoised: - return x.clamp(-1, 1) - assert False - return x - - if self.model_mean_type == ModelMeanType.PREVIOUS_X: - assert False - pred_xstart = process_xstart(self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)) - model_mean = model_output - elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]: - if self.model_mean_type == ModelMeanType.START_X: - assert False - pred_xstart = process_xstart(model_output) - else: - pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)) - model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t) - else: - raise NotImplementedError(self.model_mean_type) - - assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape - return { - "mean": model_mean, - "variance": model_variance, - "log_variance": model_log_variance, - "pred_xstart": pred_xstart, - } - - def _predict_xstart_from_eps(self, x_t, t, eps): - assert x_t.shape == eps.shape - return ( - _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps - ) - - def _predict_xstart_from_xprev(self, x_t, t, xprev): - assert x_t.shape == xprev.shape - return ( # (xprev - coef2*x_t) / coef1 - _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev - - _extract_into_tensor(self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape) * x_t - ) - - def _predict_eps_from_xstart(self, x_t, t, pred_xstart): - return ( - _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart - ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) - - def _scale_timesteps(self, t): - if self.rescale_timesteps: - return t.float() * (1000.0 / self.num_timesteps) - return t - - def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): - """ - Compute the mean for the previous step, given a function cond_fn that - computes the gradient of a conditional log probability with respect to - x. In particular, cond_fn computes grad(log(p(y|x))), and we want to - condition on y. - - This uses the conditioning strategy from Sohl-Dickstein et al. (2015). - """ - gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs) - new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() - return new_mean - - def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): - """ - Compute what the p_mean_variance output would have been, should the - model's score function be conditioned by cond_fn. - - See condition_mean() for details on cond_fn. 
- - Unlike condition_mean(), this instead uses the conditioning strategy - from Song et al (2020). - """ - alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) - - eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) - eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, self._scale_timesteps(t), **model_kwargs) - - out = p_mean_var.copy() - out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) - out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t) - return out - - def p_sample( - self, - model, - x, - t, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - ): - """ - Sample x_{t-1} from the model at the given timestep. - - :param model: the model to sample from. - :param x: the current tensor at x_{t-1}. - :param t: the value of t, starting at 0 for the first diffusion step. - :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. - :param denoised_fn: if not None, a function which applies to the - x_start prediction before it is used to sample. - :param cond_fn: if not None, this is a gradient function that acts - similarly to the model. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :return: a dict containing the following keys: - - 'sample': a random sample from the model. - - 'pred_xstart': a prediction of x_0. - """ - out = self.p_mean_variance( - model, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - model_kwargs=model_kwargs, - ) - noise = th.randn_like(x) - nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0 - if cond_fn is not None: - out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs) - sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise - return {"sample": sample, "pred_xstart": out["pred_xstart"]} - - def k_diffusion_sample_loop( - self, - k_sampler, - pbar, - model, - shape, - noise=None, # all given - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - device=None, # ALL UNUSED - model_kwargs=None, # {'precomputed_aligned_embeddings': precomputed_embeddings}, - progress=False, # unused as well - ): - assert isinstance(model_kwargs, dict) - if device is None: - device = next(model.parameters()).device - s_in = noise.new_ones([noise.shape[0]]) - - def model_split(*args, **kwargs): - model_output = model(*args, **kwargs) - model_epsilon, model_var = th.split(model_output, model_output.shape[1] // 2, dim=1) - return model_epsilon, model_var - - # - """ - print(self.betas) - print(th.tensor(self.betas)) - noise_schedule = NoiseScheduleVP(schedule='discrete', betas=th.tensor(self.betas)) - """ - noise_schedule = NoiseScheduleVP(schedule="linear", continuous_beta_0=0.1 / 4, continuous_beta_1=20.0 / 4) - - def model_fn_prewrap(x, t, *args, **kwargs): - """ - x_in = torch.cat([x] * 2) - t_in = torch.cat([t_continuous] * 2) - c_in = torch.cat([unconditional_condition, condition]) - noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2) - print(t) - print(self.timestep_map) - exit() - """ - """ - model_output = model(x, self._scale_timesteps(t*4000), **model_kwargs) - out = self.p_mean_variance(model, x, t*4000, model_kwargs=model_kwargs) - return out['pred_xstart'] - """ - x, _ = x.chunk(2) - t, _ = (t * 1000).chunk(2) - res = torch.cat( - [ - model_split(x, t, conditioning_free=True, **model_kwargs)[0], - model_split(x, t, **model_kwargs)[0], - ] - ) - 
pbar.update(1) - return res - - model_fn = model_wrapper( - model_fn_prewrap, - noise_schedule, - model_type="noise", # "noise" or "x_start" or "v" or "score" - model_kwargs=model_kwargs, - guidance_type="classifier-free", - condition=th.Tensor(1), - unconditional_condition=th.Tensor(1), - guidance_scale=self.conditioning_free_k, - ) - """ - model_fn = model_wrapper( - model_fn_prewrap, - noise_schedule, - model_type='x_start', - model_kwargs={} - ) - # - dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver") - x_sample = dpm_solver.sample( - noise, - steps=20, - order=3, - skip_type="time_uniform", - method="singlestep", - ) - """ - dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") - x_sample = dpm_solver.sample( - noise, - steps=self.num_timesteps, - order=2, - skip_type="time_uniform", - method="multistep", - ) - #''' - return x_sample - - # HF DIFFUSION ATTEMPT - """ - from .hf_diffusion import EulerAncestralDiscreteScheduler - Scheduler = EulerAncestralDiscreteScheduler() - Scheduler.set_timesteps(100) - for timestep in Scheduler.timesteps: - noise_input = Scheduler.scale_model_input(noise, timestep) - ts = s_in * timestep - model_output = model(noise_input, ts, **model_kwargs) - model_epsilon, _model_var = th.split(model_output, model_output.shape[1]//2, dim=1) - noise, _x0 = Scheduler.step(model_epsilon, timestep, noise) - return noise - """ - - # KARRAS DIFFUSION ATTEMPT - """ - TRAINED_DIFFUSION_STEPS = 4000 # HARDCODED - ratio = TRAINED_DIFFUSION_STEPS/14.5 - def call_model(*args, **kwargs): - model_output = model(*args, **kwargs) - model_output, model_var_values = th.split(model_output, model_output.shape[1]//2, dim=1) - return model_output - print(get_sigmas_karras(self.num_timesteps, sigma_min=0.0, sigma_max=4000, device=device)) - exit() - sigmas = get_sigmas_karras(self.num_timesteps, sigma_min=0.03, sigma_max=14.5, device=device) - return k_sampler(call_model, noise, sigmas, extra_args=model_kwargs, disable=not progress) - ''' - sigmas = get_sigmas_karras(self.num_timesteps, sigma_min=0.03, sigma_max=14.5, device=device) - step = 0 # LMAO - global_sigmas = None - # - def fakemodel(x, t, **model_kwargs): - print(t,global_sigmas*ratio) - return model(x, t, **model_kwargs) - def denoised(x, sigmas, **extra_args): - t = th.tensor([self.num_timesteps-step-1] * shape[0], device=device) - nonlocal global_sigmas - global_sigmas = sigmas - with th.no_grad(): - out = self.p_sample( - fakemodel, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - ) - return out["sample"] - def callback(d): - nonlocal step - step += 1 - - return k_sampler(denoised, noise, sigmas, extra_args=model_kwargs, callback=callback, disable=not progress) - ''' - """ - - def sample_loop(self, *args, **kwargs): - s = self.sampler - if s == "p": - return self.p_sample_loop(*args, **kwargs) - elif s == "ddim": - return self.ddim_sample_loop(*args, **kwargs) - elif s == "dpm++2m": - if self.conditioning_free is not True: - raise RuntimeError("cond_free must be true") - with tqdm(total=self.num_timesteps) as pbar: - return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs) - else: - raise RuntimeError("sampler not impl") - - def p_sample_loop( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - ): - """ - Generate samples from the model. - - :param model: the model module. 
- :param shape: the shape of the samples, (N, C, H, W). - :param noise: if specified, the noise from the encoder to sample. - Should be of the same shape as `shape`. - :param clip_denoised: if True, clip x_start predictions to [-1, 1]. - :param denoised_fn: if not None, a function which applies to the - x_start prediction before it is used to sample. - :param cond_fn: if not None, this is a gradient function that acts - similarly to the model. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :param device: if specified, the device to create the samples on. - If not specified, use a model parameter's device. - :param progress: if True, show a tqdm progress bar. - :return: a non-differentiable batch of samples. - """ - final = None - for sample in self.p_sample_loop_progressive( - model, - shape, - noise=noise, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - device=device, - progress=progress, - ): - final = sample - return final["sample"] - - def p_sample_loop_progressive( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - ): - """ - Generate samples from the model and yield intermediate samples from - each timestep of diffusion. - - Arguments are the same as p_sample_loop(). - Returns a generator over dicts, where each dict is the return value of - p_sample(). - """ - if device is None: - device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) - if noise is not None: - img = noise - else: - img = th.randn(*shape, device=device) - indices = list(range(self.num_timesteps))[::-1] - - for i in tqdm(indices, disable=not progress): - t = th.tensor([i] * shape[0], device=device) - with th.no_grad(): - out = self.p_sample( - model, - img, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - ) - yield out - img = out["sample"] - - def ddim_sample( - self, - model, - x, - t, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - eta=0.0, - ): - """ - Sample x_{t-1} from the model using DDIM. - - Same usage as p_sample(). - """ - out = self.p_mean_variance( - model, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - model_kwargs=model_kwargs, - ) - if cond_fn is not None: - out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs) - - # Usually our model outputs epsilon, but we re-derive it - # in case we used x_start or x_prev prediction. - eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) - - alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) - alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) - sigma = eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) - # Equation 12. - noise = th.randn_like(x) - mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps - nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0 - sample = mean_pred + nonzero_mask * sigma * noise - return {"sample": sample, "pred_xstart": out["pred_xstart"]} - - def ddim_reverse_sample( - self, - model, - x, - t, - clip_denoised=True, - denoised_fn=None, - model_kwargs=None, - eta=0.0, - ): - """ - Sample x_{t+1} from the model using DDIM reverse ODE. 
- """ - assert eta == 0.0, "Reverse ODE only for deterministic path" - out = self.p_mean_variance( - model, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - model_kwargs=model_kwargs, - ) - # Usually our model outputs epsilon, but we re-derive it - # in case we used x_start or x_prev prediction. - eps = ( - _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - out["pred_xstart"] - ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) - alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) - - # Equation 12. reversed - mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps - - return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} - - def ddim_sample_loop( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - eta=0.0, - ): - """ - Generate samples from the model using DDIM. - - Same usage as p_sample_loop(). - """ - final = None - for sample in self.ddim_sample_loop_progressive( - model, - shape, - noise=noise, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - device=device, - progress=progress, - eta=eta, - ): - final = sample - return final["sample"] - - def ddim_sample_loop_progressive( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - eta=0.0, - ): - """ - Use DDIM to sample from the model and yield intermediate samples from - each timestep of DDIM. - - Same usage as p_sample_loop_progressive(). - """ - if device is None: - device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) - if noise is not None: - img = noise - else: - img = th.randn(*shape, device=device) - indices = list(range(self.num_timesteps))[::-1] - - if progress: - # Lazy import so that we don't depend on tqdm. - from tqdm.auto import tqdm - - indices = tqdm(indices, disable=not progress) - - for i in indices: - t = th.tensor([i] * shape[0], device=device) - with th.no_grad(): - out = self.ddim_sample( - model, - img, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - eta=eta, - ) - yield out - img = out["sample"] - - def _vb_terms_bpd(self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None): - """ - Get a term for the variational lower-bound. - - The resulting units are bits (rather than nats, as one might expect). - This allows for comparison to other papers. - - :return: a dict with the following keys: - - 'output': a shape [N] tensor of NLLs or KLs. - - 'pred_xstart': the x_0 predictions. 
- """ - true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t) - out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs) - kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]) - kl = mean_flat(kl) / np.log(2.0) - - decoder_nll = -discretized_gaussian_log_likelihood( - x_start, means=out["mean"], log_scales=0.5 * out["log_variance"] - ) - assert decoder_nll.shape == x_start.shape - decoder_nll = mean_flat(decoder_nll) / np.log(2.0) - - # At the first timestep return the decoder NLL, - # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) - output = th.where((t == 0), decoder_nll, kl) - return {"output": output, "pred_xstart": out["pred_xstart"]} - - def training_losses(self, model, x_start, t, model_kwargs=None, noise=None): - """ - Compute training losses for a single timestep. - - :param model: the model to evaluate loss on. - :param x_start: the [N x C x ...] tensor of inputs. - :param t: a batch of timestep indices. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :param noise: if specified, the specific Gaussian noise to try to remove. - :return: a dict with the key "loss" containing a tensor of shape [N]. - Some mean or variance settings may also have other keys. - """ - if model_kwargs is None: - model_kwargs = {} - if noise is None: - noise = th.randn_like(x_start) - x_t = self.q_sample(x_start, t, noise=noise) - - terms = {} - - if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: - # TODO: support multiple model outputs for this mode. - terms["loss"] = self._vb_terms_bpd( - model=model, - x_start=x_start, - x_t=x_t, - t=t, - clip_denoised=False, - model_kwargs=model_kwargs, - )["output"] - if self.loss_type == LossType.RESCALED_KL: - terms["loss"] *= self.num_timesteps - elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: - model_outputs = model(x_t, self._scale_timesteps(t), **model_kwargs) - if isinstance(model_outputs, tuple): - model_output = model_outputs[0] - terms["extra_outputs"] = model_outputs[1:] - else: - model_output = model_outputs - - if self.model_var_type in [ - ModelVarType.LEARNED, - ModelVarType.LEARNED_RANGE, - ]: - B, C = x_t.shape[:2] - assert model_output.shape == (B, C * 2, *x_t.shape[2:]) - model_output, model_var_values = th.split(model_output, C, dim=1) - # Learn the variance using the variational bound, but don't let - # it affect our mean prediction. - frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) - terms["vb"] = self._vb_terms_bpd( - model=lambda *args, r=frozen_out: r, - x_start=x_start, - x_t=x_t, - t=t, - clip_denoised=False, - )["output"] - if self.loss_type == LossType.RESCALED_MSE: - # Divide by 1000 for equivalence with initial implementation. - # Without a factor of 1/1000, the VB term hurts the MSE term. - terms["vb"] *= self.num_timesteps / 1000.0 - - if self.model_mean_type == ModelMeanType.PREVIOUS_X: - target = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0] - x_start_pred = torch.zeros(x_start) # Not supported. 
- elif self.model_mean_type == ModelMeanType.START_X: - target = x_start - x_start_pred = model_output - elif self.model_mean_type == ModelMeanType.EPSILON: - target = noise - x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output) - else: - raise NotImplementedError(self.model_mean_type) - assert model_output.shape == target.shape == x_start.shape - terms["mse"] = mean_flat((target - model_output) ** 2) - terms["x_start_predicted"] = x_start_pred - if "vb" in terms: - terms["loss"] = terms["mse"] + terms["vb"] - else: - terms["loss"] = terms["mse"] - else: - raise NotImplementedError(self.loss_type) - - return terms - - def autoregressive_training_losses( - self, - model, - x_start, - t, - model_output_keys, - gd_out_key, - model_kwargs=None, - noise=None, - ): - """ - Compute training losses for a single timestep. - - :param model: the model to evaluate loss on. - :param x_start: the [N x C x ...] tensor of inputs. - :param t: a batch of timestep indices. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :param noise: if specified, the specific Gaussian noise to try to remove. - :return: a dict with the key "loss" containing a tensor of shape [N]. - Some mean or variance settings may also have other keys. - """ - if model_kwargs is None: - model_kwargs = {} - if noise is None: - noise = th.randn_like(x_start) - x_t = self.q_sample(x_start, t, noise=noise) - terms = {} - if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: - assert False # not currently supported for this type of diffusion. - elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: - model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs) - terms.update({k: o for k, o in zip(model_output_keys, model_outputs)}) - model_output = terms[gd_out_key] - if self.model_var_type in [ - ModelVarType.LEARNED, - ModelVarType.LEARNED_RANGE, - ]: - B, C = x_t.shape[:2] - assert model_output.shape == (B, C, 2, *x_t.shape[2:]) - model_output, model_var_values = ( - model_output[:, :, 0], - model_output[:, :, 1], - ) - # Learn the variance using the variational bound, but don't let - # it affect our mean prediction. - frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) - terms["vb"] = self._vb_terms_bpd( - model=lambda *args, r=frozen_out: r, - x_start=x_start, - x_t=x_t, - t=t, - clip_denoised=False, - )["output"] - if self.loss_type == LossType.RESCALED_MSE: - # Divide by 1000 for equivalence with initial implementation. - # Without a factor of 1/1000, the VB term hurts the MSE term. - terms["vb"] *= self.num_timesteps / 1000.0 - - if self.model_mean_type == ModelMeanType.PREVIOUS_X: - target = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0] - x_start_pred = torch.zeros(x_start) # Not supported. 
- elif self.model_mean_type == ModelMeanType.START_X: - target = x_start - x_start_pred = model_output - elif self.model_mean_type == ModelMeanType.EPSILON: - target = noise - x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output) - else: - raise NotImplementedError(self.model_mean_type) - assert model_output.shape == target.shape == x_start.shape - terms["mse"] = mean_flat((target - model_output) ** 2) - terms["x_start_predicted"] = x_start_pred - if "vb" in terms: - terms["loss"] = terms["mse"] + terms["vb"] - else: - terms["loss"] = terms["mse"] - else: - raise NotImplementedError(self.loss_type) - - return terms - - def _prior_bpd(self, x_start): - """ - Get the prior KL term for the variational lower-bound, measured in - bits-per-dim. - - This term can't be optimized, as it only depends on the encoder. - - :param x_start: the [N x C x ...] tensor of inputs. - :return: a batch of [N] KL values (in bits), one per batch element. - """ - batch_size = x_start.shape[0] - t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device) - qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) - kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0) - return mean_flat(kl_prior) / np.log(2.0) - - def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None): - """ - Compute the entire variational lower-bound, measured in bits-per-dim, - as well as other related quantities. - - :param model: the model to evaluate loss on. - :param x_start: the [N x C x ...] tensor of inputs. - :param clip_denoised: if True, clip denoised samples. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - - :return: a dict containing the following keys: - - total_bpd: the total variational lower-bound, per batch element. - - prior_bpd: the prior term in the lower-bound. - - vb: an [N x T] tensor of terms in the lower-bound. - - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. - - mse: an [N x T] tensor of epsilon MSEs for each timestep. - """ - device = x_start.device - batch_size = x_start.shape[0] - - vb = [] - xstart_mse = [] - mse = [] - for t in list(range(self.num_timesteps))[::-1]: - t_batch = th.tensor([t] * batch_size, device=device) - noise = th.randn_like(x_start) - x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) - # Calculate VLB term at the current timestep - with th.no_grad(): - out = self._vb_terms_bpd( - model, - x_start=x_start, - x_t=x_t, - t=t_batch, - clip_denoised=clip_denoised, - model_kwargs=model_kwargs, - ) - vb.append(out["output"]) - xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2)) - eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"]) - mse.append(mean_flat((eps - noise) ** 2)) - - vb = th.stack(vb, dim=1) - xstart_mse = th.stack(xstart_mse, dim=1) - mse = th.stack(mse, dim=1) - - prior_bpd = self._prior_bpd(x_start) - total_bpd = vb.sum(dim=1) + prior_bpd - return { - "total_bpd": total_bpd, - "prior_bpd": prior_bpd, - "vb": vb, - "xstart_mse": xstart_mse, - "mse": mse, - } - - -class SpacedDiffusion(GaussianDiffusion): - """ - A diffusion process which can skip steps in a base diffusion process. - - :param use_timesteps: a collection (sequence or set) of timesteps from the - original diffusion process to retain. - :param kwargs: the kwargs to create the base diffusion process. 
- """ - - def __init__(self, use_timesteps, **kwargs): - self.use_timesteps = set(use_timesteps) - self.timestep_map = [] - self.original_num_steps = len(kwargs["betas"]) - - base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa - last_alpha_cumprod = 1.0 - new_betas = [] - for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): - if i in self.use_timesteps: - new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) - last_alpha_cumprod = alpha_cumprod - self.timestep_map.append(i) - kwargs["betas"] = np.array(new_betas) - super().__init__(**kwargs) - - def p_mean_variance(self, model, *args, **kwargs): # pylint: disable=signature-differs - return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) - - def training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs - return super().training_losses(self._wrap_model(model), *args, **kwargs) - - def autoregressive_training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs - return super().autoregressive_training_losses(self._wrap_model(model, True), *args, **kwargs) - - def condition_mean(self, cond_fn, *args, **kwargs): - return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) - - def condition_score(self, cond_fn, *args, **kwargs): - return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) - - def _wrap_model(self, model, autoregressive=False): - if isinstance(model, _WrappedModel) or isinstance(model, _WrappedAutoregressiveModel): - return model - mod = _WrappedAutoregressiveModel if autoregressive else _WrappedModel - return mod(model, self.timestep_map, self.rescale_timesteps, self.original_num_steps) - - def _scale_timesteps(self, t): - # Scaling is done by the wrapped model. - return t - - -def space_timesteps(num_timesteps, section_counts): - """ - Create a list of timesteps to use from an original diffusion process, - given the number of timesteps we want to take from equally-sized portions - of the original process. - - For example, if there's 300 timesteps and the section counts are [10,15,20] - then the first 100 timesteps are strided to be 10 timesteps, the second 100 - are strided to be 15 timesteps, and the final 100 are strided to be 20. - - If the stride is a string starting with "ddim", then the fixed striding - from the DDIM paper is used, and only one section is allowed. - - :param num_timesteps: the number of diffusion steps in the original - process to divide up. - :param section_counts: either a list of numbers, or a string containing - comma-separated numbers, indicating the step count - per section. As a special case, use "ddimN" where N - is a number of steps to use the striding from the - DDIM paper. - :return: a set of diffusion steps from the original process to use. 
- """ - if isinstance(section_counts, str): - if section_counts.startswith("ddim"): - desired_count = int(section_counts[len("ddim") :]) - for i in range(1, num_timesteps): - if len(range(0, num_timesteps, i)) == desired_count: - return set(range(0, num_timesteps, i)) - raise ValueError(f"cannot create exactly {num_timesteps} steps with an integer stride") - section_counts = [int(x) for x in section_counts.split(",")] - size_per = num_timesteps // len(section_counts) - extra = num_timesteps % len(section_counts) - start_idx = 0 - all_steps = [] - for i, section_count in enumerate(section_counts): - size = size_per + (1 if i < extra else 0) - if size < section_count: - raise ValueError(f"cannot divide section of {size} steps into {section_count}") - if section_count <= 1: - frac_stride = 1 - else: - frac_stride = (size - 1) / (section_count - 1) - cur_idx = 0.0 - taken_steps = [] - for _ in range(section_count): - taken_steps.append(start_idx + round(cur_idx)) - cur_idx += frac_stride - all_steps += taken_steps - start_idx += size - return set(all_steps) - - -class _WrappedModel: - def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): - self.model = model - self.timestep_map = timestep_map - self.rescale_timesteps = rescale_timesteps - self.original_num_steps = original_num_steps - - def __call__(self, x, ts, **kwargs): - map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) - new_ts = map_tensor[ts] - if self.rescale_timesteps: - new_ts = new_ts.float() * (1000.0 / self.original_num_steps) - return self.model(x, new_ts, **kwargs) - - -class _WrappedAutoregressiveModel: - def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): - self.model = model - self.timestep_map = timestep_map - self.rescale_timesteps = rescale_timesteps - self.original_num_steps = original_num_steps - - def __call__(self, x, x0, ts, **kwargs): - map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) - new_ts = map_tensor[ts] - if self.rescale_timesteps: - new_ts = new_ts.float() * (1000.0 / self.original_num_steps) - return self.model(x, x0, new_ts, **kwargs) - - -def _extract_into_tensor(arr, timesteps, broadcast_shape): - """ - Extract values from a 1-D numpy array for a batch of indices. - - :param arr: the 1-D numpy array. - :param timesteps: a tensor of indices into the array to extract. - :param broadcast_shape: a larger shape of K dimensions with the batch - dimension equal to the length of timesteps. - :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. 
- """ - res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() - while len(res.shape) < len(broadcast_shape): - res = res[..., None] - return res.expand(broadcast_shape) diff --git a/TTS/tts/layers/xtts/vocoder.py b/TTS/tts/layers/xtts/vocoder.py deleted file mode 100644 index 0f4991b886..0000000000 --- a/TTS/tts/layers/xtts/vocoder.py +++ /dev/null @@ -1,385 +0,0 @@ -import json -from dataclasses import dataclass -from enum import Enum -from typing import Callable, Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F - -MAX_WAV_VALUE = 32768.0 - - -class KernelPredictor(torch.nn.Module): - """Kernel predictor for the location-variable convolutions""" - - def __init__( - self, - cond_channels, - conv_in_channels, - conv_out_channels, - conv_layers, - conv_kernel_size=3, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - kpnet_nonlinear_activation="LeakyReLU", - kpnet_nonlinear_activation_params={"negative_slope": 0.1}, - ): - """ - Args: - cond_channels (int): number of channel for the conditioning sequence, - conv_in_channels (int): number of channel for the input sequence, - conv_out_channels (int): number of channel for the output sequence, - conv_layers (int): number of layers - """ - super().__init__() - - self.conv_in_channels = conv_in_channels - self.conv_out_channels = conv_out_channels - self.conv_kernel_size = conv_kernel_size - self.conv_layers = conv_layers - - kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w - kpnet_bias_channels = conv_out_channels * conv_layers # l_b - - self.input_conv = nn.Sequential( - nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - - self.residual_convs = nn.ModuleList() - padding = (kpnet_conv_size - 1) // 2 - for _ in range(3): - self.residual_convs.append( - nn.Sequential( - nn.Dropout(kpnet_dropout), - nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - ) - self.kernel_conv = nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_kernel_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - self.bias_conv = nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_bias_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - - def forward(self, c): - """ - Args: - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - """ - batch, _, cond_length = c.shape - c = self.input_conv(c) - for residual_conv in self.residual_convs: - residual_conv.to(c.device) - c = c + residual_conv(c) - k = self.kernel_conv(c) - b = self.bias_conv(c) - kernels = k.contiguous().view( - batch, - self.conv_layers, - self.conv_in_channels, - self.conv_out_channels, - self.conv_kernel_size, - cond_length, - ) - bias = b.contiguous().view( - batch, - self.conv_layers, - self.conv_out_channels, - cond_length, - ) - - return kernels, bias - - def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.input_conv[0]) - 
nn.utils.remove_weight_norm(self.kernel_conv) - nn.utils.remove_weight_norm(self.bias_conv) - for block in self.residual_convs: - nn.utils.remove_weight_norm(block[1]) - nn.utils.remove_weight_norm(block[3]) - - -class LVCBlock(torch.nn.Module): - """the location-variable convolutions""" - - def __init__( - self, - in_channels, - cond_channels, - stride, - dilations=[1, 3, 9, 27], - lReLU_slope=0.2, - conv_kernel_size=3, - cond_hop_length=256, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - ): - super().__init__() - - self.cond_hop_length = cond_hop_length - self.conv_layers = len(dilations) - self.conv_kernel_size = conv_kernel_size - - self.kernel_predictor = KernelPredictor( - cond_channels=cond_channels, - conv_in_channels=in_channels, - conv_out_channels=2 * in_channels, - conv_layers=len(dilations), - conv_kernel_size=conv_kernel_size, - kpnet_hidden_channels=kpnet_hidden_channels, - kpnet_conv_size=kpnet_conv_size, - kpnet_dropout=kpnet_dropout, - kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope}, - ) - - self.convt_pre = nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm( - nn.ConvTranspose1d( - in_channels, - in_channels, - 2 * stride, - stride=stride, - padding=stride // 2 + stride % 2, - output_padding=stride % 2, - ) - ), - ) - - self.conv_blocks = nn.ModuleList() - for dilation in dilations: - self.conv_blocks.append( - nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm( - nn.Conv1d( - in_channels, - in_channels, - conv_kernel_size, - padding=dilation * (conv_kernel_size - 1) // 2, - dilation=dilation, - ) - ), - nn.LeakyReLU(lReLU_slope), - ) - ) - - def forward(self, x, c): - """forward propagation of the location-variable convolutions. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length) - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - - Returns: - Tensor: the output sequence (batch, in_channels, in_length) - """ - _, in_channels, _ = x.shape # (B, c_g, L') - - x = self.convt_pre(x) # (B, c_g, stride * L') - kernels, bias = self.kernel_predictor(c) - - for i, conv in enumerate(self.conv_blocks): - output = conv(x) # (B, c_g, stride * L') - - k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) - b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) - - output = self.location_variable_convolution( - output, k, b, hop_size=self.cond_hop_length - ) # (B, 2 * c_g, stride * L'): LVC - x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( - output[:, in_channels:, :] - ) # (B, c_g, stride * L'): GAU - - return x - - def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): - """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. - Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length). - kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) - bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) - dilation (int): the dilation of convolution. - hop_size (int): the hop_size of the conditioning sequence. - Returns: - (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
- """ - batch, _, in_length = x.shape - batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" - - padding = dilation * int((kernel_size - 1) / 2) - x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) - x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) - - if hop_size < dilation: - x = F.pad(x, (0, dilation), "constant", 0) - x = x.unfold( - 3, dilation, dilation - ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) - x = x[:, :, :, :, :hop_size] - x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) - x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) - - o = torch.einsum("bildsk,biokl->bolsd", x, kernel) - o = o.to(memory_format=torch.channels_last_3d) - bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) - o = o + bias - o = o.contiguous().view(batch, out_channels, -1) - - return o - - def remove_weight_norm(self): - self.kernel_predictor.remove_weight_norm() - nn.utils.remove_weight_norm(self.convt_pre[1]) - for block in self.conv_blocks: - nn.utils.remove_weight_norm(block[1]) - - -class UnivNetGenerator(nn.Module): - """ - UnivNet Generator - - Originally from https://github.com/mindslab-ai/univnet/blob/master/model/generator.py. - """ - - def __init__( - self, - noise_dim=64, - channel_size=32, - dilations=[1, 3, 9, 27], - strides=[8, 8, 4], - lReLU_slope=0.2, - kpnet_conv_size=3, - # Below are MEL configurations options that this generator requires. - hop_length=256, - n_mel_channels=100, - ): - super(UnivNetGenerator, self).__init__() - self.mel_channel = n_mel_channels - self.noise_dim = noise_dim - self.hop_length = hop_length - channel_size = channel_size - kpnet_conv_size = kpnet_conv_size - - self.res_stack = nn.ModuleList() - hop_length = 1 - for stride in strides: - hop_length = stride * hop_length - self.res_stack.append( - LVCBlock( - channel_size, - n_mel_channels, - stride=stride, - dilations=dilations, - lReLU_slope=lReLU_slope, - cond_hop_length=hop_length, - kpnet_conv_size=kpnet_conv_size, - ) - ) - - self.conv_pre = nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect")) - - self.conv_post = nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")), - nn.Tanh(), - ) - - def forward(self, c, z): - """ - Args: - c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length) - z (Tensor): the noise sequence (batch, noise_dim, in_length) - - """ - z = self.conv_pre(z) # (B, c_g, L) - - for res_block in self.res_stack: - res_block.to(z.device) - z = res_block(z, c) # (B, c_g, L * s_0 * ... 
* s_i) - - z = self.conv_post(z) # (B, 1, L * 256) - - return z - - def eval(self, inference=False): - super(UnivNetGenerator, self).eval() - # don't remove weight norm while validation in training loop - if inference: - self.remove_weight_norm() - - def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.conv_pre) - - for layer in self.conv_post: - if len(layer.state_dict()) != 0: - nn.utils.remove_weight_norm(layer) - - for res_block in self.res_stack: - res_block.remove_weight_norm() - - def inference(self, c, z=None): - # pad input mel with zeros to cut artifact - # see https://github.com/seungwonpark/melgan/issues/8 - zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device) - mel = torch.cat((c, zero), dim=2) - - if z is None: - z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device) - - audio = self.forward(mel, z) - audio = audio[:, :, : -(self.hop_length * 10)] - audio = audio.clamp(min=-1, max=1) - return audio - - -if __name__ == "__main__": - model = UnivNetGenerator() - - c = torch.randn(3, 100, 10) - z = torch.randn(3, 64, 10) - print(c.shape) - - y = model(c, z) - print(y.shape) - assert y.shape == torch.Size([3, 1, 2560]) - - pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(pytorch_total_params) diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 4aaf526111..f38dace235 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -252,7 +252,12 @@ def compute_gst(self, inputs, style_input, speaker_embedding=None): def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None): """Capacitron Variational Autoencoder""" - (VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer( + ( + VAE_outputs, + posterior_distribution, + prior_distribution, + capacitron_beta, + ) = self.capacitron_vae_layer( reference_mel_info, text_info, speaker_embedding, # pylint: disable=not-callable diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index c8cfcfdd04..16644ff95e 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -676,7 +676,12 @@ def inference( ), "Too much text provided. Break the text up into separate segments and re-try inference." 
if voice_samples is not None: - (auto_conditioning, diffusion_conditioning, _, _,) = self.get_conditioning_latents( + ( + auto_conditioning, + diffusion_conditioning, + _, + _, + ) = self.get_conditioning_latents( voice_samples, return_mels=True, latent_averaging_mode=latent_averaging_mode, diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 477f31bfc9..af94675be9 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -9,13 +9,10 @@ from coqpit import Coqpit from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, wav_to_univnet_mel -from TTS.tts.layers.tortoise.diffusion_decoder import DiffusionTts -from TTS.tts.layers.xtts.diffusion import SpacedDiffusion, get_named_beta_schedule, space_timesteps from TTS.tts.layers.xtts.gpt import GPT from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder from TTS.tts.layers.xtts.stream_generator import init_stream_support from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer -from TTS.tts.layers.xtts.vocoder import UnivNetGenerator from TTS.tts.models.base_tts import BaseTTS from TTS.utils.io import load_fsspec @@ -168,12 +165,10 @@ class XttsAudioConfig(Coqpit): Args: sample_rate (int): The sample rate in which the GPT operates. - diffusion_sample_rate (int): The sample rate of the diffusion audio waveform. output_sample_rate (int): The sample rate of the output audio waveform. """ sample_rate: int = 22050 - diffusion_sample_rate: int = 24000 output_sample_rate: int = 24000 @@ -189,7 +184,6 @@ class XttsArgs(Coqpit): clvp_checkpoint (str, optional): The checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None. decoder_checkpoint (str, optional): The checkpoint for the DiffTTS model. Defaults to None. num_chars (int, optional): The maximum number of characters to generate. Defaults to 255. - use_hifigan (bool, optional): Whether to use hifigan with implicit enhancement or diffusion + univnet as a decoder. Defaults to True. For GPT model: gpt_max_audio_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604. 
@@ -227,7 +221,6 @@ class XttsArgs(Coqpit): clvp_checkpoint: str = None decoder_checkpoint: str = None num_chars: int = 255 - use_hifigan: bool = True # XTTS GPT Encoder params tokenizer_file: str = "" @@ -324,32 +317,15 @@ def init_models(self): code_stride_len=self.args.gpt_code_stride_len, ) - if self.args.use_hifigan: - self.hifigan_decoder = HifiDecoder( - input_sample_rate=self.args.input_sample_rate, - output_sample_rate=self.args.output_sample_rate, - output_hop_length=self.args.output_hop_length, - ar_mel_length_compression=self.args.gpt_code_stride_len, - decoder_input_dim=self.args.decoder_input_dim, - d_vector_dim=self.args.d_vector_dim, - cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, - ) - - if not self.args.use_hifigan: - self.diffusion_decoder = DiffusionTts( - model_channels=self.args.diff_model_channels, - num_layers=self.args.diff_num_layers, - in_channels=self.args.diff_in_channels, - out_channels=self.args.diff_out_channels, - in_latent_channels=self.args.diff_in_latent_channels, - in_tokens=self.args.diff_in_tokens, - dropout=self.args.diff_dropout, - use_fp16=self.args.diff_use_fp16, - num_heads=self.args.diff_num_heads, - layer_drop=self.args.diff_layer_drop, - unconditioned_percentage=self.args.diff_unconditioned_percentage, - ) - self.vocoder = UnivNetGenerator() + self.hifigan_decoder = HifiDecoder( + input_sample_rate=self.args.input_sample_rate, + output_sample_rate=self.args.output_sample_rate, + output_hop_length=self.args.output_hop_length, + ar_mel_length_compression=self.args.gpt_code_stride_len, + decoder_input_dim=self.args.decoder_input_dim, + d_vector_dim=self.args.d_vector_dim, + cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, + ) @property def device(self): @@ -430,7 +406,6 @@ def get_conditioning_latents( sound_norm_refs=False, ): speaker_embedding = None - diffusion_cond_latents = None audio, sr = torchaudio.load(audio_path) audio = audio[:, : sr * max_ref_length].to(self.device) @@ -441,12 +416,9 @@ def get_conditioning_latents( if librosa_trim_db is not None: audio = librosa.effects.trim(audio, top_db=librosa_trim_db)[0] - if self.args.use_hifigan or self.args.use_hifigan: - speaker_embedding = self.get_speaker_embedding(audio, sr) - else: - diffusion_cond_latents = self.get_diffusion_cond_latents(audio, sr) + speaker_embedding = self.get_speaker_embedding(audio, sr) gpt_cond_latents = self.get_gpt_cond_latents(audio, sr, length=gpt_cond_len) # [1, 1024, T] - return gpt_cond_latents, diffusion_cond_latents, speaker_embedding + return gpt_cond_latents, speaker_embedding def synthesize(self, text, config, speaker_wav, language, **kwargs): """Synthesize speech with the given input text. @@ -579,7 +551,7 @@ def full_inference( Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length. Sample rate is 24kHz. 
""" - (gpt_cond_latent, diffusion_conditioning, speaker_embedding) = self.get_conditioning_latents( + (gpt_cond_latent, speaker_embedding) = self.get_conditioning_latents( audio_path=ref_audio_path, gpt_cond_len=gpt_cond_len, max_ref_length=max_ref_len, @@ -591,7 +563,6 @@ def full_inference( language, gpt_cond_latent, speaker_embedding, - diffusion_conditioning, temperature=temperature, length_penalty=length_penalty, repetition_penalty=repetition_penalty, @@ -614,7 +585,6 @@ def inference( language, gpt_cond_latent, speaker_embedding, - diffusion_conditioning, # GPT inference temperature=0.65, length_penalty=1, @@ -643,14 +613,6 @@ def inference( text_tokens.shape[-1] < self.args.gpt_max_text_tokens ), " ❗ XTTS can only generate text with a maximum of 400 tokens." - if not self.args.use_hifigan: - diffuser = load_discrete_vocoder_diffuser( - desired_diffusion_steps=decoder_iterations, - cond_free=cond_free, - cond_free_k=cond_free_k, - sampler=decoder_sampler, - ) - with torch.no_grad(): gpt_codes = self.gpt.generate( cond_latents=gpt_cond_latent, @@ -692,29 +654,12 @@ def inference( gpt_latents = gpt_latents[:, :k] break - if decoder == "hifigan": - assert ( - hasattr(self, "hifigan_decoder") and self.hifigan_decoder is not None - ), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`" - wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding) - else: - assert hasattr( - self, "diffusion_decoder" - ), "You must disable hifigan decoders to use difffusion by setting `use_hifigan: false`" - mel = do_spectrogram_diffusion( - self.diffusion_decoder, - diffuser, - gpt_latents, - diffusion_conditioning, - temperature=diffusion_temperature, - ) - wav = self.vocoder.inference(mel) + wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding) return { "wav": wav.cpu().numpy().squeeze(), "gpt_latents": gpt_latents, "speaker_embedding": speaker_embedding, - "diffusion_conditioning": diffusion_conditioning, } def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len): @@ -752,9 +697,6 @@ def inference_stream( decoder="hifigan", **hf_generate_kwargs, ): - assert hasattr( - self, "hifigan_decoder" - ), "`inference_stream` requires use_hifigan to be set to true in the config.model_args, diffusion is too slow to stream." 
text = text.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device) @@ -793,13 +735,7 @@ def inference_stream( if is_end or (stream_chunk_size > 0 and len(last_tokens) >= stream_chunk_size): gpt_latents = torch.cat(all_latents, dim=0)[None, :] - if decoder == "hifigan": - assert hasattr( - self, "hifigan_decoder" - ), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`" - wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device)) - else: - raise NotImplementedError("Diffusion for streaming inference not implemented.") + wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device)) wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks( wav_gen.squeeze(), wav_gen_prev, wav_overlap, overlap_wav_len ) @@ -827,10 +763,8 @@ def eval(self): # pylint: disable=redefined-builtin def get_compatible_checkpoint_state_dict(self, model_path): checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"] - ignore_keys = ["diffusion_decoder", "vocoder"] if self.args.use_hifigan else [] - ignore_keys += [] if self.args.use_hifigan else ["hifigan_decoder"] # remove xtts gpt trainer extra keys - ignore_keys += ["torch_mel_spectrogram_style_encoder", "torch_mel_spectrogram_dvae", "dvae"] + ignore_keys = ["torch_mel_spectrogram_style_encoder", "torch_mel_spectrogram_dvae", "dvae"] for key in list(checkpoint.keys()): # check if it is from the coqui Trainer if so convert it if key.startswith("xtts."): @@ -889,12 +823,7 @@ def load_checkpoint( self.load_state_dict(checkpoint, strict=strict) if eval: - if hasattr(self, "hifigan_decoder"): - self.hifigan_decoder.eval() - if hasattr(self, "diffusion_decoder"): - self.diffusion_decoder.eval() - if hasattr(self, "vocoder"): - self.vocoder.eval() + self.hifigan_decoder.eval() self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=use_deepspeed) self.gpt.eval() diff --git a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py index 9134be0db2..268a033535 100644 --- a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py @@ -94,12 +94,9 @@ def main(): gpt_num_audio_tokens=8194, gpt_start_audio_token=8192, gpt_stop_audio_token=8193, - use_ne_hifigan=True, # if it is true it will keep the non-enhanced keys on the output checkpoint ) # define audio config - audio_config = XttsAudioConfig( - sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000 - ) + audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) # training parameters config config = GPTTrainerConfig( output_path=OUT_PATH, diff --git a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py index ee6b22becd..d94204ca4d 100644 --- a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py @@ -93,14 +93,11 @@ def main(): gpt_num_audio_tokens=8194, gpt_start_audio_token=8192, gpt_stop_audio_token=8193, - use_ne_hifigan=True, # if it is true it will keep the non-enhanced keys on the output checkpoint gpt_use_masking_gt_prompt_approach=True, gpt_use_perceiver_resampler=True, ) # define audio config - audio_config = XttsAudioConfig( - sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000 - ) + audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, 
output_sample_rate=24000) # training parameters config config = GPTTrainerConfig( output_path=OUT_PATH, diff --git a/tests/xtts_tests/test_xtts_gpt_train.py b/tests/xtts_tests/test_xtts_gpt_train.py index 47b1dd7d27..03514daa3b 100644 --- a/tests/xtts_tests/test_xtts_gpt_train.py +++ b/tests/xtts_tests/test_xtts_gpt_train.py @@ -86,11 +86,8 @@ gpt_num_audio_tokens=8194, gpt_start_audio_token=8192, gpt_stop_audio_token=8193, - use_ne_hifigan=True, -) -audio_config = XttsAudioConfig( - sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000 ) +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) config = GPTTrainerConfig( epochs=1, output_path=OUT_PATH, diff --git a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py index 6b6f1330dc..8099503855 100644 --- a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py +++ b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py @@ -86,11 +86,8 @@ gpt_stop_audio_token=8193, gpt_use_masking_gt_prompt_approach=True, gpt_use_perceiver_resampler=True, - use_ne_hifigan=True, -) -audio_config = XttsAudioConfig( - sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000 ) +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) config = GPTTrainerConfig( epochs=1, output_path=OUT_PATH,
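# --- Reviewer note (not part of the patch) --------------------------------
# A minimal sketch of the XTTS inference path after this change, for context.
# `model` and "ref.wav" are placeholders, the numeric arguments are example
# values, and any parameter not visible in the hunks above is assumed from
# the surrounding API. get_conditioning_latents() now returns only
# (gpt_cond_latents, speaker_embedding), and decoding always goes through
# HifiDecoder -- the diffusion_decoder / UnivNetGenerator branch is gone.

gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path="ref.wav",   # placeholder reference clip
    gpt_cond_len=6,         # example value
    max_ref_length=10,      # example value
)
out = model.inference(
    "Hello world.",         # text to synthesize
    "en",                   # language
    gpt_cond_latent,
    speaker_embedding,      # no diffusion_conditioning argument anymore
)
wav = out["wav"]            # 24 kHz waveform decoded by the HiFi-GAN decoder

# The matching audio config likewise drops diffusion_sample_rate:
audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)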