From 286f9b18315eefa0347a7a9fbc83b72901a33108 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 18 May 2022 15:08:56 +0200
Subject: [PATCH] [T5] Fix init in TF and Flax for pretraining (#17294)

* fix init

* Apply suggestions from code review

* fix

* finish

* Update src/transformers/modeling_tf_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 src/transformers/models/t5/modeling_t5.py    |  2 ++
 src/transformers/models/t5/modeling_tf_t5.py | 13 +++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index bcd4837867c572..d2f4e29a30a59c 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -768,6 +768,8 @@ def _init_weights(self, module):
             # Mesh TensorFlow embeddings initialization
             # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
             module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
+            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
+                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
         elif isinstance(module, T5DenseReluDense):
             # Mesh TensorFlow FF initialization
             # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py
index e7bae23c871bfd..12ac789c6b4337 100644
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -1112,7 +1112,9 @@ def _shift_right(self, input_ids):
 class TFT5Model(TFT5PreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
-        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
+        self.shared = TFSharedEmbeddings(
+            config.vocab_size, config.d_model, name="shared", initializer_range=self.config.initializer_factor
+        )
 
         # retrieve correct absolute scope for embed token wrapper
         with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name:
@@ -1259,8 +1261,9 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
         self.model_dim = config.d_model
-
-        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
+        self.shared = TFSharedEmbeddings(
+            config.vocab_size, config.d_model, name="shared", initializer_range=self.config.initializer_factor
+        )
 
         # retrieve correct absolute scope for embed token wrapper
         with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name:
@@ -1600,7 +1603,9 @@ def _reorder_cache(self, past, beam_idx):
 class TFT5EncoderModel(TFT5PreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
-        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
+        self.shared = TFSharedEmbeddings(
+            config.vocab_size, config.d_model, name="shared", initializer_range=self.config.initializer_factor
+        )
 
         # retrieve correct absolute scope for embed token wrapper
         with tf.compat.v1.variable_scope("shared") as shared_abs_scope_name: