diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index d93b3d55fe..57d1f8d859 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -37,7 +37,7 @@
 # ``nn.TransformerEncoder`` consists of multiple layers of
 # `nn.TransformerEncoderLayer `__.
 # Along with the input sequence, a square attention mask is required because the
-# self-attention layers in ``nn.TransformerEncoder`` are only allowed to attend
+# self-attention layers in ``nn.TransformerDecoder`` are only allowed to attend
 # the earlier positions in the sequence. For the language modeling task, any
 # tokens on the future positions should be masked. To produce a probability
 # distribution over output words, the output of the ``nn.TransformerEncoder``
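
For context, the square attention mask this comment block refers to is a causal (subsequent) mask: row i allows attention to positions 0..i and fills all later positions with -inf so they receive zero weight after the softmax. The sketch below shows one way to build such a mask in PyTorch; the helper name is illustrative and not part of this patch, but it matches the kind of mask the tutorial's model passes along with the input sequence.

import torch

def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
    # Upper-triangular matrix with -inf strictly above the main diagonal and
    # 0.0 elsewhere: position i may attend to positions 0..i, while the -inf
    # entries mask attention to future positions.
    return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

# Example: mask for a sequence of length 4.
print(generate_square_subsequent_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])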