[docstring] Fix docstring for GPT2Config #26649

Closed · wants to merge 3 commits
8 changes: 6 additions & 2 deletions src/transformers/models/gpt2/configuration_gpt2.py
@@ -58,7 +58,7 @@ class GPT2Config(PretrainedConfig):
Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
- n_inner (`int`, *optional*, defaults to None):
+ n_inner (`int`, *optional*):
Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
activation_function (`str`, *optional*, defaults to `"gelu_new"`):
Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
@@ -68,7 +68,7 @@ class GPT2Config(PretrainedConfig):
The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
- layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -107,6 +107,10 @@ class GPT2Config(PretrainedConfig):
Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
+ bos_token_id (`int`, *optional*, defaults to 50256):
+ Beginning of stream token id.
+ eos_token_id (`int`, *optional*, defaults to 50256):
+ End of stream token id.
scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
Whether to additionally scale attention weights by `1 / layer_idx + 1`.
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
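For context, the values this docstring change documents are the actual defaults used by `GPT2Config`. A minimal sketch (not part of the PR, assuming a standard `transformers` install) that prints the defaults touched above:

```python
from transformers import GPT2Config

# Instantiate with library defaults; the prints mirror the docstring entries fixed above.
config = GPT2Config()
print(config.n_inner)             # None -> the MLP dimensionality falls back to 4 * n_embd
print(config.layer_norm_epsilon)  # 1e-05
print(config.bos_token_id)        # 50256
print(config.eos_token_id)        # 50256
```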
1 change: 0 additions & 1 deletion utils/check_docstrings.py
@@ -325,7 +325,6 @@
"FlavaTextModel",
"FocalNetModel",
"FunnelTokenizerFast",
"GPT2Config",
"GPT2Tokenizer",
"GPT2TokenizerFast",
"GPTBigCodeConfig",
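Removing `"GPT2Config"` from the ignore list above means the docstring-consistency check now covers this class: the defaults written in the docstring must agree with the `__init__` signature. A rough sketch of that comparison (illustrative only, not the actual implementation in `utils/check_docstrings.py`):

```python
import inspect

from transformers import GPT2Config

# Compare signature defaults against what the class docstring advertises; this is the
# kind of mismatch the check flags once GPT2Config leaves the ignore list.
params = inspect.signature(GPT2Config.__init__).parameters
print(params["n_inner"].default)             # None -> docstring should not claim a numeric default
print(params["layer_norm_epsilon"].default)  # 1e-05
print("defaults to 1e-05" in (GPT2Config.__doc__ or ""))  # True once this fix is applied
```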