diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index b0226025fb0..4e34b2cd142 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -857,6 +857,25 @@ def f(x):
 
         f(torch.ones(2, device="cuda", dtype=torch.float64))
 
+    def test_embedding_backward_broadcasting_decomp(self):
+        def f(grad_output, indices):
+            num_weights = 10
+            padding_idx = 1
+            scale_grad_by_freq = True
+            return torch.ops.aten.embedding_dense_backward(
+                grad_output, indices, num_weights, padding_idx, scale_grad_by_freq
+            )
+
+        f_compiled = torch.compile(f, backend="aot_eager")
+
+        grad_output = torch.ones(2, 4, 3, dtype=torch.float16)
+        indices = torch.ones(2, 4, dtype=torch.int64)
+
+        out_ref = f(grad_output, indices)
+        out_test = f_compiled(grad_output, indices)
+
+        self.assertEqual(out_ref, out_test)
+
     def test_reformer_eval(self):
         with torch.no_grad():
             cnt = self._reformer(nopython=True)
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index d2964f2bbd2..54266e1bd37 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1071,7 +1071,7 @@ def embedding_dense_backward(
         ones = torch.ones_like(indices)
         counts = counts.index_put([indices], ones, accumulate=True)
         grad_weights_scale = counts[indices]
-        grad_output = grad_output / grad_weights_scale.unsqueeze(1)
+        grad_output = grad_output / grad_weights_scale.unsqueeze(-1)
 
     mask = _unsqueeze_to_dim(indices == padding_idx, grad_output.ndim)
     grad = grad_output.masked_fill(mask, 0)
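
For context, here is a minimal standalone sketch (not part of the patch) of the broadcasting failure this change addresses. The shapes mirror the new test: with 2-D indices of shape (2, 4) and grad_output of shape (2, 4, 3), grad_weights_scale = counts[indices] has shape (2, 4); the tensor values below are made up for illustration.

import torch

grad_output = torch.ones(2, 4, 3)             # (batch, seq, embedding_dim)
grad_weights_scale = torch.full((2, 4), 2.0)  # same shape as indices

# Old code: unsqueeze(1) produces shape (2, 1, 4), which cannot broadcast
# against (2, 4, 3) -- the trailing dimensions 4 and 3 do not match.
try:
    grad_output / grad_weights_scale.unsqueeze(1)
except RuntimeError as e:
    print(f"unsqueeze(1) fails: {e}")

# New code: unsqueeze(-1) produces shape (2, 4, 1), which broadcasts the
# per-index scale over the embedding dimension as intended.
out = grad_output / grad_weights_scale.unsqueeze(-1)
print(out.shape)  # torch.Size([2, 4, 3])

# In the original 1-D indices case, grad_weights_scale has shape (N,), and
# unsqueeze(1) and unsqueeze(-1) both yield (N, 1), so existing behavior
# is unchanged.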