diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py
index c56f20aae8d..20f7f4de50c 100644
--- a/optimum/bettertransformer/models/encoder_models.py
+++ b/optimum/bettertransformer/models/encoder_models.py
@@ -288,6 +288,7 @@ def __init__(self, bert_layer, config):
         self.validate_bettertransformer()
 
     def forward(self, hidden_states, attention_mask, *_):
+        # No check on output_attentions here as roformer relies on BertLayerBetterTransformer but does not pass output_attentions as keyword argument.
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             if hidden_states.is_nested:
                 attention_mask = None
@@ -463,7 +464,10 @@ def __init__(self, bart_layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, attention_mask, position_bias=None, *_, **__):
+    def forward(self, hidden_states, attention_mask, output_attentions: bool, position_bias=None, *_, **__):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             if not hasattr(hidden_states, "original_shape"):
                 original_shape = hidden_states.shape
@@ -655,7 +659,10 @@ def __init__(self, mbart_layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, attention_mask, position_bias=None, *_, **__):
+    def forward(self, hidden_states, attention_mask, output_attentions: bool, position_bias=None, *_, **__):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             if not hasattr(hidden_states, "original_shape"):
                 original_shape = hidden_states.shape
@@ -842,7 +849,10 @@ def __init__(self, bert_layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, attn_mask, head_mask=None, output_attentions=None, *_):
+    def forward(self, hidden_states, attn_mask, output_attentions: bool, head_mask=None, *_):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             if hidden_states.is_nested:
                 attn_mask = None
@@ -1019,7 +1029,10 @@ def __init__(self, whisper_layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, attention_mask, *_, **__):
+    def forward(self, hidden_states, attention_mask, output_attentions: bool, *_, **__):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             attention_mask = None  # attention mask seems to be always None: https://github.com/huggingface/transformers/blob/94b3f544a1f5e04b78d87a2ae32a7ac252e22e31/src/transformers/models/whisper/modeling_whisper.py#L690
@@ -1139,7 +1152,10 @@ def __init__(self, vit_layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, *_, **__):
+    def forward(self, hidden_states, output_attentions: bool, *_, **__):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             attention_mask = None
@@ -1259,7 +1275,10 @@ def __init__(self, vilt_layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, *_, **__):
+    def forward(self, hidden_states, layer_head_mask, output_attentions: bool, *_, **__):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             attention_mask = None
@@ -1375,7 +1394,10 @@ def __init__(self, wav2vec2_layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, attention_mask, **__):
+    def forward(self, hidden_states, attention_mask, output_attentions: bool, **__):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             if hidden_states.is_nested:
                 attention_mask = None
@@ -1497,7 +1519,10 @@ def __init__(self, fsmt_layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, attention_mask, position_bias=None, *_, **__):
+    def forward(self, hidden_states, attention_mask, output_attentions: bool, position_bias=None, *_, **__):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             if not hasattr(hidden_states, "original_shape"):
                 original_shape = hidden_states.shape
@@ -1638,7 +1663,10 @@ def __init__(self, prophetnet_layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, attention_mask, *_, **__):
+    def forward(self, hidden_states, attention_mask, output_attentions: bool, *_, **__):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             if not hasattr(hidden_states, "original_shape"):
                 original_shape = hidden_states.shape
@@ -1772,10 +1800,13 @@ def __init__(self, layer, config):
         self.validate_bettertransformer()
 
-    def forward(self, hidden_states, attention_mask, *_, **__):
+    def forward(self, hidden_states, attention_mask, causal_attention_mask, output_attentions: bool, *_, **__):
+        if output_attentions:
+            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")
+
         if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
             # we expect attention_mask to be None in the vision model
-            if attention_mask is not None:
+            if attention_mask is not None or causal_attention_mask is not None:
                 raise ValueError(
                     "Please do not use attention masks when using `BetterTransformer` converted vision models"
                 )
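Not part of the patch: a minimal sketch of the user-facing behavior these checks enforce, assuming the standard `optimum.bettertransformer.BetterTransformer.transform` conversion API and a BART checkpoint ("facebook/bart-base" is only an illustrative choice) whose encoder layers are among those patched above. Requesting attention weights from a converted model is expected to hit the `ValueError` added in each `forward`.

import torch
from transformers import AutoModel, AutoTokenizer
from optimum.bettertransformer import BetterTransformer

# Convert an eager model to its BetterTransformer counterpart (encoder layers are swapped).
model = BetterTransformer.transform(AutoModel.from_pretrained("facebook/bart-base").eval())
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
inputs = tokenizer("Hello world", return_tensors="pt")

with torch.inference_mode():
    model(**inputs)  # fine: attention weights are simply never returned

try:
    with torch.inference_mode():
        model(**inputs, output_attentions=True)
except ValueError as err:
    # Expected: "output_attentions=True can not be supported with BetterTransformer."
    print(err)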