fix bug bert model
xuanricheng committed Aug 5, 2022
1 parent 044bc80 commit 1fe6d3e
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions flagai/model/bert_model.py
@@ -149,10 +149,10 @@ def forward(self,
         # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
         # this attention mask is more simple than the triangular masking of causal attention
         # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = extended_attention_mask.unsqueeze(
-            1).unsqueeze(2)
         if attention_mask is not None:
             extended_attention_mask = extended_attention_mask * attention_mask
+        extended_attention_mask = extended_attention_mask.unsqueeze(
+            1).unsqueeze(2)
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
         # positions we want to attend and -10000.0 for masked positions.
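The fix reorders the mask preparation: the optional attention_mask is now multiplied into extended_attention_mask while both are still low-rank, and only afterwards are the broadcast dimensions added with unsqueeze(1).unsqueeze(2). Below is a minimal sketch of why the order matters. The shapes are illustrative assumptions, not lifted from flagai/model/bert_model.py: both masks are taken to start as 2-D [batch_size, seq_length] tensors of 1.0/0.0, and the names old/new are hypothetical.

    import torch

    batch_size, seq_length = 4, 8
    extended_attention_mask = torch.ones(batch_size, seq_length)  # 1.0 = attend
    attention_mask = torch.ones(batch_size, seq_length)           # caller-supplied mask

    # Old order: unsqueeze first, then multiply.
    # [batch, 1, 1, seq] * [batch, seq] broadcasts to [batch, 1, batch, seq],
    # silently combining masks across different examples in the batch.
    old = extended_attention_mask.unsqueeze(1).unsqueeze(2) * attention_mask
    print(old.shape)  # torch.Size([4, 1, 4, 8])  -- wrong

    # New order: multiply the 2-D masks first, then add the broadcast dims.
    new = (extended_attention_mask * attention_mask).unsqueeze(1).unsqueeze(2)
    print(new.shape)  # torch.Size([4, 1, 1, 8])  -- broadcasts over heads and queries

    # The step described in the trailing comment of the hunk then turns the
    # 1.0/0.0 mask into an additive bias for the attention logits:
    # 0.0 where attending, -10000.0 where masked.
    additive_mask = (1.0 - new) * -10000.0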
