From 1fe6d3ec56f7796420d7434c1d745e534a27d7ac Mon Sep 17 00:00:00 2001
From: xuanricheng
Date: Fri, 5 Aug 2022 09:58:07 +0800
Subject: [PATCH] fix bug bert model

---
 flagai/model/bert_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flagai/model/bert_model.py b/flagai/model/bert_model.py
index e91d2c77..bf148405 100644
--- a/flagai/model/bert_model.py
+++ b/flagai/model/bert_model.py
@@ -149,10 +149,10 @@ def forward(self,
         # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
         # this attention mask is more simple than the triangular masking of causal attention
         # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = extended_attention_mask.unsqueeze(
-            1).unsqueeze(2)
         if attention_mask is not None:
             extended_attention_mask = extended_attention_mask * attention_mask
+        extended_attention_mask = extended_attention_mask.unsqueeze(
+            1).unsqueeze(2)
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
         # positions we want to attend and -10000.0 for masked positions.
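
Note (not part of the original patch): one plausible reading of this fix is that the element-wise multiply between the padding-derived mask and the user-supplied attention_mask must happen while both tensors are still 2D. Unsqueezing first leaves a [batch, 1, 1, seq] tensor that broadcasts against a [batch, seq] mask into the wrong shape. The minimal sketch below illustrates that shape behaviour with made-up sizes; the tensor names and shapes are assumptions for illustration, not the actual FlagAI call sites.

import torch

# Illustrative shapes only (assumption): both masks are [batch_size, seq_len].
batch_size, seq_len = 2, 4
extended_attention_mask = torch.ones(batch_size, seq_len)   # padding mask derived from the inputs
attention_mask = torch.tensor([[1., 1., 1., 0.],
                               [1., 1., 0., 0.]])            # user-supplied mask

# Pre-patch order: unsqueeze to [B, 1, 1, S] first, then multiply by [B, S].
# Broadcasting yields [B, 1, B, S], which is not a valid attention mask shape.
old_order = extended_attention_mask.unsqueeze(1).unsqueeze(2) * attention_mask
print(old_order.shape)   # torch.Size([2, 1, 2, 4])

# Patched order: combine the two 2D masks first, then add the broadcast
# dimensions for [batch_size, num_heads, from_seq_length, to_seq_length].
new_order = (extended_attention_mask * attention_mask).unsqueeze(1).unsqueeze(2)
print(new_order.shape)   # torch.Size([2, 1, 1, 4])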