From 1fe6d3ec56f7796420d7434c1d745e534a27d7ac Mon Sep 17 00:00:00 2001
From: xuanricheng
Date: Fri, 5 Aug 2022 09:58:07 +0800
Subject: [PATCH] fix bug bert model

---
 flagai/model/bert_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flagai/model/bert_model.py b/flagai/model/bert_model.py
index e91d2c77..bf148405 100644
--- a/flagai/model/bert_model.py
+++ b/flagai/model/bert_model.py
@@ -149,10 +149,10 @@ def forward(self,
         # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
         # this attention mask is more simple than the triangular masking of causal attention
         # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = extended_attention_mask.unsqueeze(
-            1).unsqueeze(2)
         if attention_mask is not None:
             extended_attention_mask = extended_attention_mask * attention_mask
+        extended_attention_mask = extended_attention_mask.unsqueeze(
+            1).unsqueeze(2)
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
         # positions we want to attend and -10000.0 for masked positions.
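
Note (not part of the original patch): one plausible reading of this fix is that the element-wise multiply between the padding-derived mask and the user-supplied attention_mask must happen while both tensors are still 2D. Unsqueezing first leaves a [batch, 1, 1, seq] tensor that broadcasts against a [batch, seq] mask into the wrong shape. The minimal sketch below illustrates that shape behaviour with made-up sizes; the tensor names and shapes are assumptions for illustration, not the actual FlagAI call sites.

import torch

# Illustrative shapes only (assumption): both masks are [batch_size, seq_len].
batch_size, seq_len = 2, 4
extended_attention_mask = torch.ones(batch_size, seq_len)   # padding mask derived from the inputs
attention_mask = torch.tensor([[1., 1., 1., 0.],
                               [1., 1., 0., 0.]])            # user-supplied mask

# Pre-patch order: unsqueeze to [B, 1, 1, S] first, then multiply by [B, S].
# Broadcasting yields [B, 1, B, S], which is not a valid attention mask shape.
old_order = extended_attention_mask.unsqueeze(1).unsqueeze(2) * attention_mask
print(old_order.shape)   # torch.Size([2, 1, 2, 4])

# Patched order: combine the two 2D masks first, then add the broadcast
# dimensions for [batch_size, num_heads, from_seq_length, to_seq_length].
new_order = (extended_attention_mask * attention_mask).unsqueeze(1).unsqueeze(2)
print(new_order.shape)   # torch.Size([2, 1, 1, 4])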