diff --git a/torchaudio/models/wav2vec2/components.py b/torchaudio/models/wav2vec2/components.py index 7c6b282fa5..c0b4f3d416 100644 --- a/torchaudio/models/wav2vec2/components.py +++ b/torchaudio/models/wav2vec2/components.py @@ -1037,3 +1037,15 @@ def forward(self, x: Tensor, label: Tensor, mask_m: Tensor, mask_u: Tensor) -> T label_u = label[mask_u] logit_u = _compute_logits(proj_x_u, label_u, self.label_embeddings) return logit_m, logit_u + + +class GradMultiply(torch.autograd.Function): + @staticmethod + def forward(ctx, x, scale): + ctx.scale = scale + res = x.new(x) + return res + + @staticmethod + def backward(ctx, grad): + return grad * ctx.scale, None diff --git a/torchaudio/models/wav2vec2/model.py b/torchaudio/models/wav2vec2/model.py index 4bb4bcb2f6..3778dd5695 100644 --- a/torchaudio/models/wav2vec2/model.py +++ b/torchaudio/models/wav2vec2/model.py @@ -137,6 +137,11 @@ class HuBERTPretrainModel(Module): logit_generator (torch.nn.Module): Logit generator that predicts the logits of the masked and unmasked inputs. + + feature_grad_mult (float or None): + The factor to scale the convolutional feature extraction layer gradients by. + If ``None``, the gradients of feature extraction layers are not affected. + The scale factor will not affect the forward pass. """ def __init__( @@ -144,11 +149,16 @@ def __init__( wav2vec2: Wav2Vec2Model, mask_generator: Module, logit_generator: Module, + feature_grad_mult: Optional[float], ): super().__init__() self.wav2vec2 = wav2vec2 self.mask_generator = mask_generator self.logit_generator = logit_generator + assert ( + feature_grad_mult is None or 0.0 < feature_grad_mult < 1.0 + ), f"The value of `feature_grad_mult` must be ``None`` or between (0, 1). Found {feature_grad_mult}" + self.feature_grad_mult = feature_grad_mult def forward( self, @@ -184,6 +194,8 @@ def forward( Shape: `(1,)`. """ x, lengths = self.wav2vec2.feature_extractor(waveforms, audio_lengths) + if self.feature_grad_mult is not None and self.feature_grad_mult < 1.0: + x = components.GradMultiply.apply(x, self.feature_grad_mult) features_pen = x.float().pow(2).mean() if lengths is not None: padding_mask = components._get_padding_mask(x, lengths) @@ -712,6 +724,7 @@ def hubert_pretrain_model( skip_nomask: bool, num_classes: int, final_dim: int, + feature_grad_mult: Optional[float], ) -> HuBERTPretrainModel: # Overriding the signature so that the return type is correct on Sphinx """hubert_pretrain_model(extractor_mode: str, extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]], extractor_conv_bias: bool, encoder_embed_dim: int, encoder_projection_dropout: float, encoder_pos_conv_kernel: int, encoder_pos_conv_groups: int, encoder_num_layers: int, encoder_num_heads: int, encoder_attention_dropout: float, encoder_ff_interm_features: int, encoder_ff_interm_dropout: float, encoder_dropout: float, encoder_layer_norm_first: bool, encoder_layer_drop: float, mask_prob: float, mask_selection: str, mask_other: float, mask_length: int, no_mask_overlap: bool, mask_min_space: int, mask_channel_prob: float, mask_channel_selection: str, mask_channel_other: float, mask_channel_length: int, no_mask_channel_overlap: bool, mask_channel_min_space: int, skip_masked: bool, skip_nomask: bool, num_classes: int, final_dim: int) -> torchaudio.models.HuBERTPretrainModel @@ -910,6 +923,12 @@ def hubert_pretrain_model( This option corresponds to ``final_dim`` from ``fairseq``. + feature_grad_mult (float or None): + The factor to scale the convolutional feature extraction layer gradients by. + The scale factor will not affect the forward pass. + + This option corresponds to ``feature_grad_mult`` from ``fairseq``. + Returns: HuBERTPretrainModel: The resulting model. @@ -958,7 +977,12 @@ def hubert_pretrain_model( skip_masked, skip_nomask, ) - return HuBERTPretrainModel(wav2vec2=wav2vec2, mask_generator=mask_generator, logit_generator=logit_generator) + return HuBERTPretrainModel( + wav2vec2=wav2vec2, + mask_generator=mask_generator, + logit_generator=logit_generator, + feature_grad_mult=feature_grad_mult, + ) def hubert_pretrain_base( @@ -970,10 +994,11 @@ def hubert_pretrain_base( mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, + feature_grad_mult: Optional[float] = 0.1, num_classes: int = 100, ) -> HuBERTPretrainModel: # Overriding the signature so that the return type is correct on Sphinx - """hubert_pretrain_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.05, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, num_classes: int = 100) -> torchaudio.models.HuBERTPretrainModel + """hubert_pretrain_base(encoder_projection_dropout: float = 0.1, encoder_attention_dropout: float = 0.1, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.1, encoder_layer_drop: float = 0.05, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, feature_grad_mult: Optional[float] = 0.1, num_classes: int = 100) -> torchaudio.models.HuBERTPretrainModel Build HuBERTPretrainModel model with "base" architecture from *HuBERT* [:footcite:`hsu2021hubert`] @@ -994,6 +1019,8 @@ def hubert_pretrain_base( See :py:func:`hubert_pretrain_model`. mask_channel_length (int): See :py:func:`hubert_pretrain_model`. + feature_grad_mult (float or None): + See :py:func:`hubert_pretrain_model`. num_classes (int, optional): See :py:func:`hubert_pretrain_model`. @@ -1033,6 +1060,7 @@ def hubert_pretrain_base( skip_nomask=False, num_classes=num_classes, final_dim=256, + feature_grad_mult=feature_grad_mult, ) @@ -1045,9 +1073,10 @@ def hubert_pretrain_large( mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, + feature_grad_mult: Optional[float] = None, ) -> HuBERTPretrainModel: # Overriding the signature so that the return type is correct on Sphinx - """hubert_pretrain_large(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10) -> torchaudio.models.HuBERTPretrainModel + """hubert_pretrain_large(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, feature_grad_mult: Optional[float] = None) -> torchaudio.models.HuBERTPretrainModel Build HuBERTPretrainModel model for pre-training with "large" architecture from *HuBERT* [:footcite:`hsu2021hubert`] @@ -1068,6 +1097,8 @@ def hubert_pretrain_large( See :py:func:`hubert_pretrain_model`. mask_channel_length (int): See :py:func:`hubert_pretrain_model`. + feature_grad_mult (float or None): + See :py:func:`hubert_pretrain_model`. Returns: HuBERTPretrainModel: @@ -1105,6 +1136,7 @@ def hubert_pretrain_large( skip_nomask=False, num_classes=500, final_dim=768, + feature_grad_mult=feature_grad_mult, ) @@ -1117,9 +1149,10 @@ def hubert_pretrain_xlarge( mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, + feature_grad_mult: Optional[float] = None, ) -> HuBERTPretrainModel: # Overriding the signature so that the return type is correct on Sphinx - """hubert_pretrain_xlarge(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10) -> torchaudio.models.HuBERTPretrainModel + """hubert_pretrain_xlarge(encoder_projection_dropout: float = 0.0, encoder_attention_dropout: float = 0.0, encoder_ff_interm_dropout: float = 0.0, encoder_dropout: float = 0.0, encoder_layer_drop: float = 0.0, mask_prob: float = 0.8, mask_channel_prob: float = 0.0, mask_channel_length: int = 10, feature_grad_mult: Optional[float] = None) -> torchaudio.models.HuBERTPretrainModel Build HuBERTPretrainModel model for pre-training with "extra large" architecture from *HuBERT* [:footcite:`hsu2021hubert`] @@ -1140,6 +1173,8 @@ def hubert_pretrain_xlarge( See :py:func:`hubert_pretrain_model`. mask_channel_length (int): See :py:func:`hubert_pretrain_model`. + feature_grad_mult (float or None): + See :py:func:`hubert_pretrain_model`. Returns: HuBERTPretrainModel: @@ -1177,4 +1212,5 @@ def hubert_pretrain_xlarge( skip_nomask=False, num_classes=500, final_dim=1024, + feature_grad_mult=feature_grad_mult, )