diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index d93b3d55fe..57d1f8d859 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -37,7 +37,7 @@
 # ``nn.TransformerEncoder`` consists of multiple layers of
 # `nn.TransformerEncoderLayer `__.
 # Along with the input sequence, a square attention mask is required because the
-# self-attention layers in ``nn.TransformerEncoder`` are only allowed to attend
+# self-attention layers in ``nn.TransformerDecoder`` are only allowed to attend
 # the earlier positions in the sequence. For the language modeling task, any
 # tokens on the future positions should be masked. To produce a probability
 # distribution over output words, the output of the ``nn.TransformerEncoder``
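
For context, the square attention mask this comment block refers to is a causal (subsequent) mask: row i allows attention to positions 0..i and fills all later positions with -inf so they receive zero weight after the softmax. The sketch below shows one way to build such a mask in PyTorch; the helper name is illustrative and not part of this patch, but it matches the kind of mask the tutorial's model passes along with the input sequence.

import torch

def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
    # Upper-triangular matrix with -inf strictly above the main diagonal and
    # 0.0 elsewhere: position i may attend to positions 0..i, while the -inf
    # entries mask attention to future positions.
    return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

# Example: mask for a sequence of length 4.
print(generate_square_subsequent_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])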