[docstring] Update GPT2 and Whisper (huggingface#26642)
* [DOCS] Update docstrings for GPT2 and Whisper tokenizer

* [DOCS] add pad_token argument to whisper tokenizer docstring

* [FIX] Reword pad_token description

* [CHORE] Apply style formatting

---------

Co-authored-by: jmcdonnell <jmcdonnell@fieldbox.ai>
2 people authored and blbadger committed Nov 8, 2023
1 parent 9f882bc commit 64b5c88
Showing 6 changed files with 33 additions and 31 deletions.
src/transformers/models/gpt2/configuration_gpt2.py (8 changes: 6 additions & 2 deletions)
@@ -58,7 +58,7 @@ class GPT2Config(PretrainedConfig):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- n_inner (`int`, *optional*, defaults to None):
+ n_inner (`int`, *optional*):
Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
activation_function (`str`, *optional*, defaults to `"gelu_new"`):
Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
@@ -68,7 +68,7 @@ class GPT2Config(PretrainedConfig):
The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
- layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -107,6 +107,10 @@ class GPT2Config(PretrainedConfig):
Scale attention weights by dividing by sqrt(hidden_size)..
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
+ bos_token_id (`int`, *optional*, defaults to 50256):
+ Id of the beginning of sentence token in the vocabulary.
+ eos_token_id (`int`, *optional*, defaults to 50256):
+ Id of the end of sentence token in the vocabulary.
scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
Whether to additionally scale attention weights by `1 / layer_idx + 1`.
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
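An aside, not part of the commit: a minimal sketch of the defaults documented above, assuming a standard transformers install.

from transformers import GPT2Config

# Defaults documented in the diff above: n_inner of None resolves to
# 4 * n_embd, and both special token ids default to 50256.
config = GPT2Config()
assert config.bos_token_id == config.eos_token_id == 50256
assert config.n_inner is None
assert config.layer_norm_epsilon == 1e-05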
src/transformers/models/gpt2/tokenization_gpt2.py (11 changes: 8 additions & 3 deletions)
@@ -136,16 +136,21 @@ class GPT2Tokenizer(PreTrainedTokenizer):
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
- unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The beginning of sequence token.
- eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
+ pad_token (`str`, *optional*):
+ The token used for padding, for example when batching sequences of different lengths.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (GPT2 tokenizer detect beginning of words by the preceding space).
+ add_bos_token (`bool`, *optional*, defaults to `False`):
+ Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading
+ word just as any other word.
"""

vocab_files_names = VOCAB_FILES_NAMES
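A usage sketch, not part of the commit: GPT-2 checkpoints ship no pad token, so the pad_token argument documented above is usually supplied by the caller, and add_bos_token prepends the beginning of sentence token. Assumes the public "gpt2" checkpoint is reachable.

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    pad_token="<|endoftext|>",  # reuse EOS for padding when batching
    add_bos_token=True,         # prepend <|endoftext|> to every encoding
)
# padding=True now works because a pad token is defined
batch = tokenizer(["short", "a somewhat longer input"], padding=True)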
src/transformers/models/gpt2/tokenization_gpt2_fast.py (18 changes: 8 additions & 10 deletions)
@@ -95,25 +95,23 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
refer to this superclass for more information regarding those methods.
Args:
- vocab_file (`str`):
+ vocab_file (`str`, *optional*):
Path to the vocabulary file.
- merges_file (`str`):
+ merges_file (`str`, *optional*):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
- unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ tokenizer_file (`str`, *optional*):
+ Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+ contains everything needed to load the tokenizer.
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
- bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The beginning of sequence token.
- eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
- add_prefix_space (`bool`, *optional*, defaults to `False`):
- Whether or not to add an initial space to the input. This allows to treat the leading word just as any
- other word. (GPT2 tokenizer detect beginning of words by the preceding space).
- trim_offsets (`bool`, *optional*, defaults to `True`):
- Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""

vocab_files_names = VOCAB_FILES_NAMES
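A hedged sketch of why vocab_file and merges_file are now marked *optional*: with the tokenizer_file argument documented above, the fast tokenizer can be built from a single serialized file ("tokenizer.json" here is a hypothetical local path).

from transformers import GPT2TokenizerFast

# Load everything (vocab, merges, special tokens) from one tokenizers file.
tokenizer = GPT2TokenizerFast(tokenizer_file="tokenizer.json")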
src/transformers/models/whisper/tokenization_whisper.py (4 changes: 3 additions & 1 deletion)
@@ -224,7 +224,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
- normalizer_file (`str`, *optional*, defaults to `None`):
+ normalizer_file (`str`, *optional*):
Path to the normalizer_file file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
@@ -237,6 +237,8 @@ class WhisperTokenizer(PreTrainedTokenizer):
`"<|startoftranscript|>"` when generating.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
+ pad_token (`str`, *optional*):
+ The token used for padding, for example when batching sequences of different lengths.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word.
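Again as an aside, a sketch of the pad_token argument documented above, assuming the public "openai/whisper-tiny" checkpoint.

from transformers import WhisperTokenizer

# Setting pad_token enables padded batches, e.g. when preparing labels.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", pad_token="<|endoftext|>")
labels = tokenizer(["hello world", "a longer transcription"], padding=True)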
src/transformers/models/whisper/tokenization_whisper_fast.py (18 changes: 8 additions & 10 deletions)
@@ -95,28 +95,26 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
refer to this superclass for more information regarding those methods.
Args:
- vocab_file (`str`):
+ vocab_file (`str`, *optional*):
Path to the vocabulary file.
- merges_file (`str`):
+ merges_file (`str`, *optional*):
Path to the merges file.
- normalizer_file (`str`, *optional*, defaults to `None`):
+ normalizer_file (`str`, *optional*):
Path to the normalizer_file file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
- unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ tokenizer_file (`str`, *optional*):
+ Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+ contains everything needed to load the tokenizer.
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
`"<|startoftranscript|>"` when generating.
- eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token.
- add_prefix_space (`bool`, *optional*, defaults to `False`):
- Whether or not to add an initial space to the input. This allows to treat the leading word just as any
- other word. (Whisper tokenizer detect beginning of words by the preceding space).
- trim_offsets (`bool`, *optional*, defaults to `True`):
- Whether or not the post-processing step should trim offsets to avoid including whitespaces.
language (`str`, *optional*):
The language of the transcription text. The corresponding language id token is appended to the start of the
sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token
utils/check_docstrings.py (5 changes: 0 additions & 5 deletions)
@@ -316,9 +316,6 @@
"FlavaTextModel",
"FocalNetModel",
"FunnelTokenizerFast",
"GPT2Config",
"GPT2Tokenizer",
"GPT2TokenizerFast",
"GPTBigCodeConfig",
"GPTJConfig",
"GPTNeoXConfig",
@@ -789,8 +786,6 @@
"WhisperConfig",
"WhisperFeatureExtractor",
"WhisperForAudioClassification",
"WhisperTokenizer",
"WhisperTokenizerFast",
"XCLIPTextConfig",
"XCLIPVisionConfig",
"XGLMConfig",
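Removing these classes from OBJECTS_TO_IGNORE is what gives the commit its teeth: utils/check_docstrings.py now validates their docstrings against the real signatures, which is why the defaults and *optional* markers above had to be corrected. If memory of the utility serves, running python utils/check_docstrings.py reproduces the check locally, and its --fix_and_overwrite flag applies automatic fixes.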
