diff --git a/intel_extension_for_transformers/llm/runtime/graph/__init__.py b/intel_extension_for_transformers/llm/runtime/graph/__init__.py
index 489cfbb35e8..716b32c275b 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/__init__.py
+++ b/intel_extension_for_transformers/llm/runtime/graph/__init__.py
@@ -78,13 +78,13 @@ def get_model_type(model_config):
     def init(self, model_name, use_quant=True, use_cache=False, use_gptq=False, **quant_kwargs):
         self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        model_type = Model.get_model_type(self.config)
-        self.__import_package(model_type)
+        self.model_type = Model.get_model_type(self.config)
+        self.__import_package(self.model_type)
 
         # check cache and quantization
         output_path = "runtime_outs"
         os.makedirs(output_path, exist_ok=True)
-        fp32_bin = "{}/ne_{}_f32.bin".format(output_path, model_type)
+        fp32_bin = "{}/ne_{}_f32.bin".format(output_path, self.model_type)
         quant_desc = quant_kwargs['weight_dtype']
         if quant_kwargs['use_ggml']:
             quant_desc += "_ggml"
@@ -96,7 +96,7 @@ def init(self, model_name, use_quant=True, use_cache=False, use_gptq=False, **qu
             quant_desc += "_g{}".format(quant_kwargs['group_size'])
         if use_gptq:
             quant_desc = "gptq"
-        quant_bin = "{}/ne_{}_q_{}.bin".format(output_path, model_type, quant_desc)
+        quant_bin = "{}/ne_{}_q_{}.bin".format(output_path, self.model_type, quant_desc)
 
         if not use_quant:
             self.bin_file = fp32_bin
@@ -199,7 +199,7 @@ def is_token_end(self):
         return self.model.is_token_end()
 
     def eos_token_id(self):
-        if self.tokenizer.eos_token_id == None:
+        if self.model_type == 'qwen':
             return self.tokenizer.special_tokens['<|endoftext|>']
         return self.tokenizer.eos_token_id
 
diff --git a/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_llama.py b/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_llama.py
index dceeeab9125..b3e6a6b7d5a 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_llama.py
+++ b/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_llama.py
@@ -247,7 +247,8 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+            print(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+            added_tokens = {}
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
         self.added_tokens_list = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
diff --git a/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_mistral.py b/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_mistral.py
index 1db1fdd7f89..d35bf0b05ff 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_mistral.py
+++ b/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_mistral.py
@@ -247,7 +247,8 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+            print(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+            added_tokens = {}
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
         self.added_tokens_list = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size