From 0148b0a10af72885fcdd9ab92aa1c02c50ffff42 Mon Sep 17 00:00:00 2001 From: zspo Date: Wed, 24 May 2023 02:50:40 +0800 Subject: [PATCH] Fix some docs about what layerdrop does (#23691) * Fix some docs about what layerdrop does * Update src/transformers/models/data2vec/configuration_data2vec_audio.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Fix more docs --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../pabee/modeling_pabee_albert.py | 2 +- src/transformers/generation/logits_process.py | 2 +- src/transformers/modeling_outputs.py | 4 ++-- src/transformers/modeling_utils.py | 2 +- .../models/align/configuration_align.py | 2 +- src/transformers/models/blip/modeling_blip_text.py | 2 +- .../models/blip/modeling_tf_blip_text.py | 2 +- src/transformers/models/blip_2/modeling_blip_2.py | 2 +- src/transformers/models/bloom/modeling_bloom.py | 2 +- .../data2vec/configuration_data2vec_audio.py | 3 +++ .../configuration_deformable_detr.py | 2 +- src/transformers/models/deta/configuration_deta.py | 2 +- .../efficientnet/configuration_efficientnet.py | 2 +- .../models/hubert/configuration_hubert.py | 3 +++ src/transformers/models/lxmert/modeling_lxmert.py | 14 +++++++------- .../models/lxmert/modeling_tf_lxmert.py | 10 +++++----- .../models/mask2former/modeling_mask2former.py | 4 ++-- .../models/mpnet/tokenization_mpnet.py | 2 +- .../models/mpnet/tokenization_mpnet_fast.py | 2 +- src/transformers/models/opt/configuration_opt.py | 2 +- .../models/pegasus_x/configuration_pegasus_x.py | 4 ++-- src/transformers/models/rag/modeling_rag.py | 2 +- src/transformers/models/rag/retrieval_rag.py | 4 ++-- src/transformers/models/realm/modeling_realm.py | 2 +- src/transformers/models/sew/configuration_sew.py | 3 +++ .../models/unispeech/configuration_unispeech.py | 3 +++ .../unispeech_sat/configuration_unispeech_sat.py | 3 +++ .../models/wav2vec2/configuration_wav2vec2.py | 3 +++ .../configuration_wav2vec2_conformer.py | 3 +++ .../models/wavlm/configuration_wavlm.py | 3 +++ src/transformers/optimization_tf.py | 4 ++-- src/transformers/pipelines/audio_utils.py | 2 +- src/transformers/pipelines/base.py | 2 +- src/transformers/tokenization_utils_base.py | 2 +- src/transformers/training_args_tf.py | 2 +- ...uration_{{cookiecutter.lowercase_modelname}}.py | 4 ++-- 36 files changed, 68 insertions(+), 44 deletions(-) diff --git a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py index 5e17352dc19b..57b649ec067b 100644 --- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py +++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py @@ -253,7 +253,7 @@ def forward( Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: - loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Classification (or regression if config.num_labels==1) loss. logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` Classification (or regression if config.num_labels==1) scores (before SoftMax).
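For context on the parameter these docstring fixes describe: LayerDrop ([Fan et al., 2019](https://arxiv.org/abs/1909.11556)) randomly skips entire transformer layers during training, as a form of structured dropout. A minimal sketch of the idea, using hypothetical module names rather than the library's actual implementation:

```python
import torch
from torch import nn


class EncoderWithLayerDrop(nn.Module):
    """Illustrative LayerDrop: skip each layer with probability `layerdrop`."""

    def __init__(self, layers: nn.ModuleList, layerdrop: float = 0.1):
        super().__init__()
        self.layers = layers
        self.layerdrop = layerdrop

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            # Sample once per layer; layers are only dropped in training mode.
            if self.training and torch.rand(()).item() < self.layerdrop:
                continue
            hidden_states = layer(hidden_states)
        return hidden_states
```

In eval mode every layer runs, which is why `layerdrop`, unlike the various `*_dropout` parameters, changes the effective depth of the network only during training.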
diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 95c8064ee404..dd51610afd43 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -678,7 +678,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor): generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more information. Args: - prefix_allowed_tokens_fn: (`Callable[[int, torch.Tensor], List[int]]`): + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`): This function constraints the beam search to allowed tokens only at each step. This function takes 2 arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py index c69e426ab531..aceec7abd406 100755 --- a/src/transformers/modeling_outputs.py +++ b/src/transformers/modeling_outputs.py @@ -1522,7 +1522,7 @@ class Seq2SeqTSModelOutput(ModelOutput): scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*): Scaling values of each time series' context window which is used to give the model inputs of the same magnitude and then used to rescale back to the original magnitude. - static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*): + static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*): Static features of each time series' in a batch which are copied to the covariates at inference time. """ @@ -1593,7 +1593,7 @@ class Seq2SeqTSPredictionOutput(ModelOutput): scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*): Scaling values of each time series' context window which is used to give the model inputs of the same magnitude and then used to rescale back to the original magnitude. - static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*): + static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*): Static features of each time series' in a batch which are copied to the covariates at inference time. """ diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index a8e8e4b2e241..b7b832b4640e 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -912,7 +912,7 @@ def get_head_mask( The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). num_hidden_layers (`int`): The number of hidden layers in the model. - is_attention_chunked: (`bool`, *optional*, defaults to `False`): + is_attention_chunked (`bool`, *optional*, defaults to `False`): Whether or not the attentions scores are computed by chunks or not. Returns: diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index 488b7f6fe458..0436b278f0a7 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -184,7 +184,7 @@ class AlignVisionConfig(PretrainedConfig): List of output channel sizes to be used in each block for convolutional layers. depthwise_padding (`List[int]`, *optional*, defaults to `[]`): List of block indices with square padding. 
- strides: (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`): + strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`): List of stride sizes to be used in each block for convolutional layers. num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`): List of the number of times each block is to repeated. diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 85585f3d5dd3..1f269cf852ee 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -613,7 +613,7 @@ def get_extended_attention_mask( Mask with ones indicating tokens to attend to, zeros for tokens to ignore. input_shape (`Tuple[int]`): The shape of the input to the model. - device: (`torch.device`): + device (`torch.device`): The device of the input to the model. Returns: diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py b/src/transformers/models/blip/modeling_tf_blip_text.py index 6e8ed8a891c0..02f592c25902 100644 --- a/src/transformers/models/blip/modeling_tf_blip_text.py +++ b/src/transformers/models/blip/modeling_tf_blip_text.py @@ -633,7 +633,7 @@ def get_extended_attention_mask( Mask with ones indicating tokens to attend to, zeros for tokens to ignore. input_shape (`Tuple[int]`): The shape of the input to the model. - is_decoder: (`bool`): + is_decoder (`bool`): Whether the model is used as a decoder. Returns: diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index be499dc1d516..381a3fe0ed7a 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1059,7 +1059,7 @@ def get_extended_attention_mask( Mask with ones indicating tokens to attend to, zeros for tokens to ignore. input_shape (`Tuple[int]`): The shape of the input to the model. - device: (`torch.device`): + device (`torch.device`): The device of the input to the model. Returns: diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index e954cfb44730..5c0d570cbe9c 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -256,7 +256,7 @@ def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: Merge heads together over the last dimenstion Args: - x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] + x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] Returns: torch.tensor: [batch_size, seq_length, num_heads * head_dim] diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index 2ec526924f36..066d81a5daed 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -62,6 +62,9 @@ class Data2VecAudioConfig(PretrainedConfig): The dropout ratio for the attention probabilities. final_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the final projection layer of [`Data2VecAudioForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more + details.
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 54614eef96f9..dbe5fd7f0a78 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -77,7 +77,7 @@ class DeformableDetrConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. init_xavier_std (`float`, *optional*, defaults to 1): The scaling factor used for the Xavier initialization gain in the HM Attention map module. - encoder_layerdrop: (`float`, *optional*, defaults to 0.0): + encoder_layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. auxiliary_loss (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index 836e9732e68e..dc85ea91df36 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -71,7 +71,7 @@ class DetaConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. init_xavier_std (`float`, *optional*, defaults to 1): The scaling factor used for the Xavier initialization gain in the HM Attention map module. - encoder_layerdrop: (`float`, *optional*, defaults to 0.0): + encoder_layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. auxiliary_loss (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/efficientnet/configuration_efficientnet.py b/src/transformers/models/efficientnet/configuration_efficientnet.py index a6e4d172f442..e6b6a1c261ca 100644 --- a/src/transformers/models/efficientnet/configuration_efficientnet.py +++ b/src/transformers/models/efficientnet/configuration_efficientnet.py @@ -60,7 +60,7 @@ class EfficientNetConfig(PretrainedConfig): List of output channel sizes to be used in each block for convolutional layers. depthwise_padding (`List[int]`, *optional*, defaults to `[]`): List of block indices with square padding. - strides: (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`): + strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`): List of stride sizes to be used in each block for convolutional layers. num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`): List of the number of times each block is to repeated. diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index 139df45bbb79..1f326871c3c9 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -62,6 +62,9 @@ class HubertConfig(PretrainedConfig): The dropout ratio for the attention probabilities. final_dropout (`float`, *optional*, defaults to 0.1): The dropout probabilitiy for the final projection layer of [`Wav2Vec2ForCTC`]. 
+ layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more + details. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index 9fe7ecb730fd..73586872e8a1 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -111,7 +111,7 @@ class LxmertForQuestionAnsweringOutput(ModelOutput): loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.k. - question_answering_score: (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*): + question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*): Prediction scores of question answering objective (classification). language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of @@ -153,10 +153,10 @@ class LxmertForPreTrainingOutput(ModelOutput): (classification) loss. prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - cross_relationship_score: (`torch.FloatTensor` of shape `(batch_size, 2)`): + cross_relationship_score (`torch.FloatTensor` of shape `(batch_size, 2)`): Prediction scores of the textual matching objective (classification) head (scores of True/False continuation before SoftMax). - question_answering_score: (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`): + question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`): Prediction scores of question answering objective (classification). language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of @@ -828,12 +828,12 @@ def _init_weights(self, module): [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) - visual_feats: (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`): + visual_feats (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`): This input represents visual features. They ROI pooled object features from bounding boxes using a faster-RCNN model) These are currently not provided by the transformers library. - visual_pos: (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`): + visual_pos (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`): This input represents spacial features corresponding to their relative (via index) visual features. The pre-trained LXMERT model expects these spacial features to be normalized bounding boxes on a scale of 0 to 1.
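As a concrete illustration of the `visual_pos` format documented in the hunk above, a hedged sketch with made-up numbers, assuming corner-format `(x0, y0, x1, y1)` pixel boxes from a detector:

```python
import torch

# Hypothetical detector output: (batch_size, num_visual_features, 4) pixel boxes.
boxes = torch.tensor([[[48.0, 32.0, 320.0, 240.0],
                       [10.0, 10.0, 600.0, 400.0]]])
img_w, img_h = 640.0, 480.0

# Scale every coordinate into the 0-to-1 range the pre-trained LXMERT
# checkpoints expect for `visual_pos`.
visual_pos = boxes / torch.tensor([img_w, img_h, img_w, img_h])
```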
@@ -1171,7 +1171,7 @@ def forward( Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` - obj_labels: (`Dict[Str: Tuple[Torch.FloatTensor, Torch.FloatTensor]]`, *optional*): + obj_labels (`Dict[str, Tuple[torch.FloatTensor, torch.FloatTensor]]`, *optional*): each key is named after each one of the visual losses and each element of the tuple is of the shape `(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for each the label id and the label score respectively @@ -1398,7 +1398,7 @@ def forward( return_dict: Optional[bool] = None, ) -> Union[LxmertForQuestionAnsweringOutput, Tuple[torch.FloatTensor]]: r""" - labels: (`Torch.Tensor` of shape `(batch_size)`, *optional*): + labels (`torch.Tensor` of shape `(batch_size)`, *optional*): A one-hot representation of the correct answer """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index 948053c93e28..f02d0ff6446a 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -111,10 +111,10 @@ class TFLxmertForPreTrainingOutput(ModelOutput): (classification) loss. prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - cross_relationship_score: (`tf.Tensor` of shape `(batch_size, 2)`): + cross_relationship_score (`tf.Tensor` of shape `(batch_size, 2)`): Prediction scores of the textual matching objective (classification) head (scores of True/False continuation before SoftMax). - question_answering_score: (`tf.Tensor` of shape `(batch_size, n_qa_answers)`): + question_answering_score (`tf.Tensor` of shape `(batch_size, n_qa_answers)`): Prediction scores of question answering objective (classification). language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape @@ -873,12 +873,12 @@ def serving(self, inputs): [`PreTrainedTokenizer.encode`] for details. [What are input IDs?](../glossary#input-ids) - visual_feats: (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`): + visual_feats (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`): This input represents visual features. They ROI pooled object features from bounding boxes using a faster-RCNN model) These are currently not provided by the transformers library. - visual_pos: (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`): + visual_pos (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`): This input represents spacial features corresponding to their relative (via index) visual features. The pre-trained LXMERT model expects these spacial features to be normalized bounding boxes on a scale of 0 to 1. @@ -1297,7 +1297,7 @@ def call( Labels for computing the masked language modeling loss.
Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` - obj_labels: (`Dict[Str: Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to `None`): + obj_labels (`Dict[str, Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to `None`): each key is named after each one of the visual losses and each element of the tuple is of the shape `(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for each the label id and the label score respectively diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index 289b4919e369..4cb2493e58c8 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -1767,7 +1767,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module): of the predicted mask for each query, instead of attending to the full feature map. Args: - config: (`Mask2FormerConfig`): + config (`Mask2FormerConfig`): Configuration used to instantiate Mask2FormerMaskedAttentionDecoder. """ @@ -2003,7 +2003,7 @@ def __init__(self, hidden_size: int, num_heads: int, mask_feature_size: torch.Te The feature dimension of the Mask2FormerMaskedAttentionDecoder num_heads (`int`): The number of heads used in the Mask2FormerMaskedAttentionDecoder - mask_feature_size: (`torch.Tensor`): + mask_feature_size (`torch.Tensor`): one of the output dimensions of the predicted masks for each query """ super().__init__() diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py index 1f5ad2f41aae..57a06beeead0 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet.py +++ b/src/transformers/models/mpnet/tokenization_mpnet.py @@ -119,7 +119,7 @@ class MPNetTokenizer(PreTrainedTokenizer): This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)). - strip_accents: (`bool`, *optional*): + strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). """ diff --git a/src/transformers/models/mpnet/tokenization_mpnet_fast.py b/src/transformers/models/mpnet/tokenization_mpnet_fast.py index 288c69c62b3c..82d8ffec08d9 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet_fast.py +++ b/src/transformers/models/mpnet/tokenization_mpnet_fast.py @@ -98,7 +98,7 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast): tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this issue](https://github.com/huggingface/transformers/issues/328)). - strip_accents: (`bool`, *optional*): + strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT).
""" diff --git a/src/transformers/models/opt/configuration_opt.py b/src/transformers/models/opt/configuration_opt.py index df13b3201998..d2b7a4347ea4 100644 --- a/src/transformers/models/opt/configuration_opt.py +++ b/src/transformers/models/opt/configuration_opt.py @@ -67,7 +67,7 @@ class OPTConfig(PretrainedConfig): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - layerdrop: (`float`, *optional*, defaults to 0.0): + layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. init_std (`float`, *optional*, defaults to 0.02): diff --git a/src/transformers/models/pegasus_x/configuration_pegasus_x.py b/src/transformers/models/pegasus_x/configuration_pegasus_x.py index c393a6b8a910..f48e19bdcbca 100644 --- a/src/transformers/models/pegasus_x/configuration_pegasus_x.py +++ b/src/transformers/models/pegasus_x/configuration_pegasus_x.py @@ -70,10 +70,10 @@ class PegasusXConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - encoder_layerdrop: (`float`, *optional*, defaults to 0.0): + encoder_layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. - decoder_layerdrop: (`float`, *optional*, defaults to 0.0): + decoder_layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. use_cache (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 6941bca09c74..019b26ef08e9 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -1430,7 +1430,7 @@ def generate( priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s default values, whose documentation should be checked to parameterize generation. - prefix_allowed_tokens_fn: (`Callable[[int, torch.Tensor], List[int]]`, *optional*): + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*): If provided, this function constraints the beam search to allowed tokens only at each step. If not provided no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py index 261255b9f62f..88cb54115bf5 100644 --- a/src/transformers/models/rag/retrieval_rag.py +++ b/src/transformers/models/rag/retrieval_rag.py @@ -573,10 +573,10 @@ def __call__( Retrieves documents for specified `question_hidden_states`. Args: - question_input_ids: (`List[List[int]]`) batch of input ids + question_input_ids (`List[List[int]]`) batch of input ids question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`: A batch of query vectors to retrieve with. 
- prefix: (`str`, *optional*): + prefix (`str`, *optional*): The prefix used by the generator's tokenizer. n_docs (`int`, *optional*): The number of docs retrieved per query. diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 261b2b4a0b8c..339cc27e9289 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -726,7 +726,7 @@ class RealmReaderOutput(ModelOutput): The index of the retrieved span candidates in which the predicted answer is most likely. start_pos (`torch.IntTensor` of shape `()`): Predicted answer starting position in *RealmReader*'s inputs. - end_pos: (`torch.IntTensor` of shape `()`): + end_pos (`torch.IntTensor` of shape `()`): Predicted answer ending position in *RealmReader*'s inputs. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py index af7041843de3..07e3a7df26d8 100644 --- a/src/transformers/models/sew/configuration_sew.py +++ b/src/transformers/models/sew/configuration_sew.py @@ -63,6 +63,9 @@ class SEWConfig(PretrainedConfig): The dropout ratio for the attention probabilities. final_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the final projection layer of [`SEWForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more + details. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py index f4e8df659e9a..4054c49f3e0c 100644 --- a/src/transformers/models/unispeech/configuration_unispeech.py +++ b/src/transformers/models/unispeech/configuration_unispeech.py @@ -65,6 +65,9 @@ class UniSpeechConfig(PretrainedConfig): The dropout ratio for the attention probabilities. final_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the final projection layer of [`UniSpeechForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more + details. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py index 222f982fe769..8bd482f394d0 100644 --- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py @@ -66,6 +66,9 @@ class UniSpeechSatConfig(PretrainedConfig): The dropout ratio for the attention probabilities. final_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the final projection layer of [`UniSpeechSatForCTC`].
+ layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more + details. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py index 7afcd3f0ee28..6f7709e535d1 100644 --- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py +++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py @@ -63,6 +63,9 @@ class Wav2Vec2Config(PretrainedConfig): The dropout ratio for the attention probabilities. final_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more + details. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py index b9f24c7e7080..24b7dca73944 100644 --- a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py @@ -65,6 +65,9 @@ class Wav2Vec2ConformerConfig(PretrainedConfig): The dropout ratio for the attention probabilities. final_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the final projection layer of [`Wav2Vec2ConformerForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more + details. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py index becbe100240d..c3ac6ba196a8 100644 --- a/src/transformers/models/wavlm/configuration_wavlm.py +++ b/src/transformers/models/wavlm/configuration_wavlm.py @@ -62,6 +62,9 @@ class WavLMConfig(PretrainedConfig): The dropout ratio for the attention probabilities. final_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the final projection layer of [`WavLMForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more + details. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index b42e04041b8f..382eae2a30e2 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -117,9 +117,9 @@ def create_optimizer( The beta2 to use in Adam.
adam_epsilon (`float`, *optional*, defaults to 1e-8): The epsilon to use in Adam. - adam_clipnorm: (`float`, *optional*, defaults to `None`): + adam_clipnorm (`float`, *optional*, defaults to `None`): If not `None`, clip the gradient norm for each weight tensor to this value. - adam_global_clipnorm: (`float`, *optional*, defaults to `None`) + adam_global_clipnorm (`float`, *optional*, defaults to `None`): If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all weight tensors, as if they were concatenated into a single vector. weight_decay_rate (`float`, *optional*, defaults to 0): diff --git a/src/transformers/pipelines/audio_utils.py b/src/transformers/pipelines/audio_utils.py index 62c2b00c467a..f17dd68d6439 100644 --- a/src/transformers/pipelines/audio_utils.py +++ b/src/transformers/pipelines/audio_utils.py @@ -119,7 +119,7 @@ def ffmpeg_microphone_live( The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of an audio sample but without using that part to actually make the prediction. Setting this does not change the length of the chunk. - format_for_conversion: (`str`, defalts to `f32le`) + format_for_conversion (`str`, defaults to `f32le`): The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le` could also be used. Return: diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index f57d08efffd6..4fc256266449 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -514,7 +514,7 @@ def from_str( Creates an instance of the right subclass of [`~pipelines.PipelineDataFormat`] depending on `format`. Args: - format: (`str`): + format (`str`): The format of the desired pipeline. Acceptable values are `"json"`, `"csv"` or `"pipe"`. output_path (`str`, *optional*): Where to save the outgoing data. diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index a186c8338bad..ecc9d5011d78 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2093,7 +2093,7 @@ def save_pretrained( If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exits, a value error is raised. - filename_prefix: (`str`, *optional*): + filename_prefix (`str`, *optional*): A prefix to add to the names of the files saved by the tokenizer. push_to_hub (`bool`, *optional*, defaults to `False`): Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 847bbdb78a15..461c4086acc3 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -66,7 +66,7 @@ class TFTrainingArguments(TrainingArguments): The batch size per GPU/TPU core/CPU for training. per_device_eval_batch_size (`int`, *optional*, defaults to 8): The batch size per GPU/TPU core/CPU for evaluation. - gradient_accumulation_steps: (`int`, *optional*, defaults to 1): + gradient_accumulation_steps (`int`, *optional*, defaults to 1): Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
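A usage sketch for the two clipping arguments documented in the `optimization_tf.py` hunk above (values are illustrative; requires TensorFlow, since `create_optimizer` lives in `optimization_tf.py`):

```python
from transformers import create_optimizer

# `adam_clipnorm` clips each weight tensor's gradient norm independently;
# `adam_global_clipnorm` instead clips the norm computed over all tensors
# as if they were concatenated into a single vector. Use one or the other.
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=10_000,
    num_warmup_steps=500,
    adam_clipnorm=1.0,
)
```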
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py index 3221696317bd..2898b5cf6f8f 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py @@ -107,10 +107,10 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - encoder_layerdrop: (`float`, *optional*, defaults to 0.0): + encoder_layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. - decoder_layerdrop: (`float`, *optional*, defaults to 0.0): + decoder_layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. use_cache (`bool`, *optional*, defaults to `True`):
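Finally, since two of the touched docstrings (`logits_process.py` and `modeling_rag.py`) describe `prefix_allowed_tokens_fn`, a hedged sketch of such a callable; the token ids below are hypothetical, and a real constraint would normally condition on the tokens generated so far (e.g., by walking a trie of valid continuations):

```python
from typing import List

import torch

ALLOWED_TOKEN_IDS = [42, 1337, 2048]  # hypothetical vocabulary subset


def prefix_allowed_tokens_fn(batch_id: int, input_ids: torch.Tensor) -> List[int]:
    # `input_ids` holds the tokens generated so far for this beam/batch entry.
    return ALLOWED_TOKEN_IDS


# outputs = model.generate(input_ids, num_beams=4,
#                          prefix_allowed_tokens_fn=prefix_allowed_tokens_fn)
```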