From b0298aa8cc4a54bde659e57271778630785abc9b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 28 Sep 2024 16:11:25 +0800 Subject: [PATCH] [Misc] Remove vLLM patch of `BaichuanTokenizer` (#8921) --- vllm/transformers_utils/tokenizer.py | 16 +- .../transformers_utils/tokenizers/__init__.py | 5 +- .../transformers_utils/tokenizers/baichuan.py | 255 ------------------ 3 files changed, 3 insertions(+), 273 deletions(-) delete mode 100644 vllm/transformers_utils/tokenizers/baichuan.py diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index e3b244d06660d..85c339df4a76c 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -11,8 +11,7 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizers import (BaichuanTokenizer, - MistralTokenizer) +from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import make_async @@ -139,19 +138,6 @@ def get_tokenizer( raise RuntimeError(err_msg) from e else: raise e - except AttributeError as e: - if "BaichuanTokenizer" in str(e): - # This is for the error "'BaichuanTokenizer' object has no - # attribute 'sp_model'". - tokenizer = BaichuanTokenizer.from_pretrained( - tokenizer_name, - *args, - trust_remote_code=trust_remote_code, - revision=revision, - **kwargs, - ) - else: - raise e # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324 if type(tokenizer).__name__ in ("ChatGLMTokenizer", diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py index 9433f2d48f6f3..5f437d414e181 100644 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ b/vllm/transformers_utils/tokenizers/__init__.py @@ -1,4 +1,3 @@ -from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from .mistral import MistralTokenizer -__all__ = ["BaichuanTokenizer", "MistralTokenizer"] +__all__ = ["MistralTokenizer"] diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py deleted file mode 100644 index 76daabc41e0a2..0000000000000 --- a/vllm/transformers_utils/tokenizers/baichuan.py +++ /dev/null @@ -1,255 +0,0 @@ -# Adapted from -# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py -# This includes a fix suggested in -# https://github.com/vllm-project/vllm/issues/1403#issuecomment-1767503058 -# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved. - -import os -from shutil import copyfile -from typing import Any, Dict, List, Optional, Tuple - -import sentencepiece as spm -from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} - -PRETRAINED_VOCAB_FILES_MAP = { # type: ignore - "vocab_file": {}, - "tokenizer_file": {}, -} -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} # type: ignore - - -class BaichuanTokenizer(PreTrainedTokenizer): - """ - Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - unk_token="", - bos_token="", - eos_token="", - pad_token=None, - sp_model_kwargs: Optional[Dict[str, Any]] = None, - add_bos_token=True, - add_eos_token=False, - clean_up_tokenization_spaces=False, - **kwargs, - ): - self.sp_model_kwargs = ({} if sp_model_kwargs is None else - sp_model_kwargs) - bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False) - if isinstance(bos_token, str) else bos_token) - eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False) - if isinstance(eos_token, str) else eos_token) - unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False) - if isinstance(unk_token, str) else unk_token) - pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False) - if isinstance(pad_token, str) else pad_token) - self.vocab_file = vocab_file - self.add_bos_token = add_bos_token - self.add_eos_token = add_eos_token - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - add_bos_token=add_bos_token, - add_eos_token=add_eos_token, - sp_model_kwargs=self.sp_model_kwargs, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - **kwargs, - ) - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - @property - def vocab_size(self): - """Returns vocab size""" - return self.sp_model.get_piece_size() - - def get_vocab(self): - """Returns vocab as a dict""" - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text): - """Returns a tokenized string.""" - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.piece_to_id(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - token = self.sp_model.IdToPiece(index) - return token - - def convert_tokens_to_string(self, tokens: List[str]): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens: List[str] = [] - out_string = "" - prev_is_special = False - for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using - # sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special and i != 0: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string - - def save_vocabulary(self, - save_directory, - filename_prefix: Optional[str] = None) -> Tuple[str]: - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (`str`): - The directory in which to save the vocabulary. - - Returns: - `Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - raise ValueError(f"Vocabulary path ({save_directory}) " - "should be a directory") - - out_vocab_file = os.path.join( - save_directory, - (filename_prefix + "-" if filename_prefix else "") + - VOCAB_FILES_NAMES["vocab_file"], - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file, ) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = bos_token_id + token_ids_0 + eos_token_id - - if token_ids_1 is not None: - output = output + bos_token_id + token_ids_1 + eos_token_id - - return output - - def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False, - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens - added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to - `False`): - Whether or not the token list is already formatted with - special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: - 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, - token_ids_1=token_ids_1, - already_has_special_tokens=True, - ) - - bos_token_id = [1] if self.add_bos_token else [] - eos_token_id = [1] if self.add_eos_token else [] - - if token_ids_1 is None: - return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + - bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Creates a mask from the two sequences passed to be used in a - sequence-pair classification task. An ALBERT - sequence pair mask has the following format: - - ``` - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - ``` - - if token_ids_1 is None, only returns the first portion of the mask (0s). - - Args: - token_ids_0 (`List[int]`): - List of ids. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) - according to the given sequence(s). - """ - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) - - if token_ids_1 is not None: - output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) - - return output