[transformer] fix sdpa u2pp training nan (#2419)
Mddct authored Mar 19, 2024
1 parent d27e61d commit d51d1bc
Showing 2 changed files with 4 additions and 17 deletions.
3 changes: 1 addition & 2 deletions wenet/transformer/attention.py
@@ -425,8 +425,7 @@ def forward(
             assert mask.dtype != torch.bool
             mask = mask.unsqueeze(1)
             # matrix_bd as a mask bias
-            mask = torch.where(mask == get_dtype_min(mask.dtype), mask,
-                               matrix_bd / math.sqrt(self.d_k))
+            mask = (matrix_bd + mask) / math.sqrt(self.d_k)
             output = torch.nn.functional.scaled_dot_product_attention(
                 q_with_bias_u,
                 k,
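
The change above replaces the torch.where construction with a plain sum: the padding-mask bias and the relative-position term matrix_bd are added and pre-scaled together before being passed to scaled_dot_product_attention as attn_mask. A minimal, standalone illustration (not WeNet code) of one way a dtype-minimum bias can produce NaN while a finite -1e+10 cannot: the float32 minimum turns into -inf when cast to float16 under mixed precision or when two such terms are summed, and a softmax row that is entirely -inf evaluates to NaN, whereas -1e+10 stays finite and well defined.

# Standalone illustration, not WeNet code: dtype-minimum biases can reach -inf,
# and an all -inf softmax row is NaN; a finite -1e+10 bias never gets there.
import torch

fp32_min = torch.finfo(torch.float32).min               # ~ -3.4e38
print(torch.tensor(fp32_min).to(torch.float16))         # -inf (overflow on the fp16 cast)
print(torch.tensor(fp32_min) + torch.tensor(fp32_min))  # -inf (overflow even in fp32)

row = torch.full((4,), float("-inf"))
print(torch.softmax(row, dim=-1))                       # nan nan nan nan

row = torch.full((4,), -1e10)
print(torch.softmax(row, dim=-1))                       # 0.25 0.25 0.25 0.25
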
18 changes: 3 additions & 15 deletions wenet/utils/common.py
@@ -310,21 +310,9 @@ def log_add(*args) -> float:
     return a_max + lsp


-def get_dtype_min(
-    dtype: torch.dtype,
-    eps16: float = torch.finfo(torch.float16).min,
-    eps32: float = torch.finfo(torch.float32).min,
-    eps64: float = torch.finfo(torch.float64).min,
-    epsbf16: float = torch.finfo(torch.bfloat16).min,
-):
-    if dtype == torch.float16:
-        return eps16
-    elif dtype == torch.float32:
-        return eps32
-    elif dtype == torch.float64:
-        return eps64
-    elif dtype == torch.bfloat16:
-        return epsbf16
+def get_dtype_min(dtype: torch.dtype, ):
+    if dtype in [torch.float32, torch.bfloat16, torch.float16]:
+        return -1e+10
     else:
         raise RuntimeError(f"expected x to be floating-point, got {dtype}")
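
With this change, get_dtype_min returns the same finite -1e+10 for every floating-point dtype, so the value can safely be added to other attention terms instead of being compared against. A hypothetical usage sketch (not taken from the repository) of how a boolean padding mask could become an additive bias, be combined with a relative-position term as in the attention.py change, and be fed to scaled_dot_product_attention; the tensor names and shapes are made up, and the scale= keyword assumes a recent PyTorch (2.1+).

# Hypothetical usage sketch, not code from the repository.
import math
import torch

def get_dtype_min(dtype: torch.dtype):
    if dtype in [torch.float32, torch.bfloat16, torch.float16]:
        return -1e+10
    raise RuntimeError(f"expected x to be floating-point, got {dtype}")

batch, heads, time, d_k = 2, 4, 6, 16
q = torch.randn(batch, heads, time, d_k)
k = torch.randn(batch, heads, time, d_k)
v = torch.randn(batch, heads, time, d_k)
matrix_bd = torch.randn(batch, heads, time, time)   # stand-in for the rel-pos term

bool_mask = torch.ones(batch, 1, time, time, dtype=torch.bool)
bool_mask[..., 4:] = False                           # pretend the last two frames are padding

# True (keep) -> 0.0, False (pad) -> -1e+10, then combine and pre-scale as in the diff
bias = (~bool_mask).to(q.dtype) * get_dtype_min(q.dtype)
attn_mask = (matrix_bd + bias) / math.sqrt(d_k)

out = torch.nn.functional.scaled_dot_product_attention(
    q, k, v, attn_mask=attn_mask, scale=1.0 / math.sqrt(d_k))
print(out.shape, torch.isnan(out).any())             # torch.Size([2, 4, 6, 16]) tensor(False)
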
