fix: Remove trailing \n from llama3 <|eot_id|>
The documentation is inconsistent about whether there should be a \n after
<|eot_id|>; removing it keeps the output consistent with the previous
formatting.

Branch: GraniteCodeSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Committed by gabe-l-hart on Dec 11, 2024 (commit d624ed3, parent 4882899)
Showing 2 changed files with 12 additions and 23 deletions.
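
For orientation, here is a minimal, hypothetical sketch of the string-level
Llama3 chat format the updated tests expect: each <|eot_id|> is followed
immediately by the next <|start_header_id|>, with no intervening "\n".
render_llama3 is an illustrative helper written for this note, not a
torchchat API.

# Illustrative only: renders messages in the Llama3 chat format, with
# <|eot_id|> abutting the next <|start_header_id|> directly.
def render_llama3(messages):
    out = "<|begin_of_text|>"
    for m in messages:
        out += (
            f"<|start_header_id|>{m['role']}<|end_header_id|>\n\n"
            f"{m['content']}<|eot_id|>"
        )
    return out

print(render_llama3([
    {"role": "system", "content": "Be concise."},
    {"role": "user", "content": "Hi!"},
]))
# <|begin_of_text|><|start_header_id|>system<|end_header_id|>
#
# Be concise.<|eot_id|><|start_header_id|>user<|end_header_id|>
#
# Hi!<|eot_id|>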
tests/test_chat_formatters.py (11 additions, 22 deletions)

@@ -139,44 +139,33 @@ def test_llama2_chat_formatter(messages, expected):
         # single user message (no system prompt)
         (MSGS_NO_SYS, f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

-{USER1}<|eot_id|>
-"""),
+{USER1}<|eot_id|>"""),
         # sys, usr
         (MSGS_SYS_USR, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

-{SYSTEM_PROMPT}<|eot_id|>
-<|start_header_id|>user<|end_header_id|>
+{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>

-{USER1}<|eot_id|>
-"""),
+{USER1}<|eot_id|>"""),
         # sys, usr, asst
         (MSGS_SYS_USR_ASST, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

-{SYSTEM_PROMPT}<|eot_id|>
-<|start_header_id|>user<|end_header_id|>
+{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>

-{USER1}<|eot_id|>
-<|start_header_id|>assistant<|end_header_id|>
+{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

-{ASSISTANT1}<|eot_id|>
-"""),
+{ASSISTANT1}<|eot_id|>"""),
         # sys, usr, asst, usr, asst
         (MSGS_MULTI_TURN, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

-{SYSTEM_PROMPT}<|eot_id|>
-<|start_header_id|>user<|end_header_id|>
+{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>

-{USER1}<|eot_id|>
-<|start_header_id|>assistant<|end_header_id|>
+{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

-{ASSISTANT1}<|eot_id|>
-<|start_header_id|>user<|end_header_id|>
+{ASSISTANT1}<|eot_id|><|start_header_id|>user<|end_header_id|>

-{USER2}<|eot_id|>
-<|start_header_id|>assistant<|end_header_id|>
+{USER2}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

-{ASSISTANT2}<|eot_id|>
-"""),
+{ASSISTANT2}<|eot_id|>"""),
     ]
 )
 @pytest.mark.parametrize("add_generation_prompt", [True, False])
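
(Net effect in the expected strings: every <|eot_id|>\n<|start_header_id|>
pair collapses to <|eot_id|><|start_header_id|>, and each final <|eot_id|>
now closes its f-string directly instead of being followed by a trailing
newline.)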
torchchat/generate.py (1 addition, 1 deletion)

@@ -121,7 +121,7 @@ def _encode_message(self, message: _ChatFormatter.MESSAGE_TYPE) -> List[int]:
                 self.tokenizer.encode(content["text"], bos=False, eos=False)
             )

-        tokens.append(self.tokenizer.special_tokens["<|eot_id|>\n"])
+        tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
         return tokens

     def encode_dialog_prompt(
