diff --git a/.devops/tools.sh b/.devops/tools.sh index 3a7d274e46619..97424c3aa746a 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -8,7 +8,7 @@ arg1="$1" shift if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then - python3 ./convert.py "$@" + python3 ./convert-hf-to-gguf.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then ./quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then diff --git a/CMakeLists.txt b/CMakeLists.txt index ef02ff66967f3..cca17889013e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1306,7 +1306,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR} install(TARGETS llama LIBRARY PUBLIC_HEADER) install( - FILES convert.py + FILES convert-hf-to-gguf.py PERMISSIONS OWNER_READ OWNER_WRITE diff --git a/README.md b/README.md index 2ee267fdf6887..e44f39d22702f 100644 --- a/README.md +++ b/README.md @@ -692,7 +692,8 @@ Building the program with BLAS support may lead to some performance improvements To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. -Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face. +Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives. +It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face. ```bash # obtain the official LLaMA model weights and place them in ./models @@ -709,10 +710,10 @@ ls ./models python3 -m pip install -r requirements.txt # convert the model to ggml FP16 format -python3 convert.py models/mymodel/ +python3 convert-hf-to-gguf.py models/mymodel/ # [Optional] for models using BPE tokenizers -python convert.py models/mymodel/ --vocab-type bpe +python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe # quantize the model to 4-bits (using Q4_K_M method) ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M diff --git a/ci/run.sh b/ci/run.sh index 9402990250a20..3fc5f48b2e2af 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 { (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf + python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5a00a5e89accb..3af4d4132ef7a 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -25,8 +25,6 @@ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf -from convert import LlamaHfVocab - logger = logging.getLogger("hf-to-gguf") @@ -632,7 +630,7 @@ def _set_vocab_sentencepiece(self): special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_llama_hf(self): - vocab = LlamaHfVocab(self.dir_model) + vocab = gguf.LlamaHfVocab(self.dir_model) tokens = [] scores = [] toktypes = [] diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md index 48769cdf61092..1381242485960 100644 --- a/docs/HOWTO-add-model.md +++ b/docs/HOWTO-add-model.md @@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M ### 1. Convert the model to GGUF This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. -Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py). +Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format). The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. diff --git a/convert.py b/examples/convert-legacy-llama.py similarity index 82% rename from convert.py rename to examples/convert-legacy-llama.py index da1247957780c..fd840101569a9 100755 --- a/convert.py +++ b/examples/convert-legacy-llama.py @@ -24,14 +24,16 @@ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional +from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional import numpy as np -from sentencepiece import SentencePieceProcessor if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) + # use .parent.parent since we are in "examples" directory + sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py')) + import gguf +from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab if TYPE_CHECKING: from typing_extensions import Self, TypeAlias @@ -380,306 +382,6 @@ def load(metadata_path: Path) -> Metadata: return metadata -# -# vocab -# - - -@runtime_checkable -class BaseVocab(Protocol): - tokenizer_model: ClassVar[str] - name: ClassVar[str] - - -class NoVocab(BaseVocab): - tokenizer_model = "no_vocab" - name = "no_vocab" - - def __repr__(self) -> str: - return "" - - -@runtime_checkable -class Vocab(BaseVocab, Protocol): - vocab_size: int - added_tokens_dict: dict[str, int] - added_tokens_list: list[str] - fname_tokenizer: Path - - def __init__(self, base_path: Path): ... - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ... - - -class BpeVocab(Vocab): - tokenizer_model = "gpt2" - name = "bpe" - - def __init__(self, base_path: Path): - added_tokens: dict[str, int] = {} - - if (fname_tokenizer := base_path / 'vocab.json').exists(): - # "slow" tokenizer - with open(fname_tokenizer, encoding="utf-8") as f: - self.vocab = json.load(f) - - try: - # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. - with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f: - added_tokens = json.load(f) - except FileNotFoundError: - pass - else: - # "fast" tokenizer - fname_tokenizer = base_path / FAST_TOKENIZER_FILE - - # if this fails, FileNotFoundError propagates to caller - with open(fname_tokenizer, encoding="utf-8") as f: - tokenizer_json = json.load(f) - - tokenizer_model: dict[str, Any] = tokenizer_json['model'] - if ( - tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False) - or tokenizer_json['decoder']['type'] != 'ByteLevel' - ): - raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer') - - self.vocab = tokenizer_model["vocab"] - - if (added := tokenizer_json.get('added_tokens')) is not None: - # Added tokens here can be duplicates of the main vocabulary. - added_tokens = {item['content']: item['id'] - for item in added - if item['content'] not in self.vocab} - - vocab_size = len(self.vocab) - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - expected_end_id = vocab_size + len(actual_ids) - 1 - raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " - f"{vocab_size} - {expected_end_id}; got {actual_ids}") - - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_dict = added_tokens - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - - def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} - - for i, _ in enumerate(self.vocab): - yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.CONTROL - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.bpe_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class SentencePieceVocab(Vocab): - tokenizer_model = "llama" - name = "spm" - - def __init__(self, base_path: Path): - added_tokens: dict[str, int] = {} - if (fname_tokenizer := base_path / 'tokenizer.model').exists(): - # normal location - try: - with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f: - added_tokens = json.load(f) - except FileNotFoundError: - pass - elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists(): - # not found in alternate location either - raise FileNotFoundError('Cannot find tokenizer.model') - - self.sentencepiece_tokenizer = SentencePieceProcessor() - self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer)) - vocab_size = self.sentencepiece_tokenizer.vocab_size() - - new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} - expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) - - if expected_new_ids != actual_new_ids: - raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") - - # Token pieces that were added to the base vocabulary. - self.added_tokens_dict = added_tokens - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - - def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.sentencepiece_tokenizer - for i in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(i) - text = piece.encode("utf-8") - score: float = tokenizer.GetScore(i) - - toktype = gguf.TokenType.NORMAL - if tokenizer.IsUnknown(i): - toktype = gguf.TokenType.UNKNOWN - if tokenizer.IsControl(i): - toktype = gguf.TokenType.CONTROL - - # NOTE: I think added_tokens are user defined. - # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto - # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED - - if tokenizer.IsUnused(i): - toktype = gguf.TokenType.UNUSED - if tokenizer.IsByte(i): - toktype = gguf.TokenType.BYTE - - yield text, score, toktype - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.sentencepiece_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class LlamaHfVocab(Vocab): - tokenizer_model = "llama" - name = "hfft" - - def __init__(self, base_path: Path): - fname_tokenizer = base_path / FAST_TOKENIZER_FILE - # if this fails, FileNotFoundError propagates to caller - with open(fname_tokenizer, encoding='utf-8') as f: - tokenizer_json = json.load(f) - - # pre-check so we know if we need transformers - tokenizer_model: dict[str, Any] = tokenizer_json['model'] - is_llama3 = ( - tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False) - and not tokenizer_model.get('byte_fallback', True) - ) - if is_llama3: - raise TypeError('Llama 3 must be converted with BpeVocab') - - if not is_llama3 and ( - tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False) - or tokenizer_json['decoder']['type'] != 'Sequence' - ): - raise FileNotFoundError('Cannot find Llama BPE tokenizer') - - try: - from transformers import AutoTokenizer - except ImportError as e: - raise ImportError( - "To use LlamaHfVocab, please install the `transformers` package. " - "You can install it with `pip install transformers`." - ) from e - - # Allow the tokenizer to default to slow or fast versions. - # Explicitly set tokenizer to use local paths. - self.tokenizer = AutoTokenizer.from_pretrained( - base_path, - cache_dir=base_path, - local_files_only=True, - ) - assert self.tokenizer.is_fast # assume tokenizer.json is used - - # Initialize lists and dictionaries for added tokens - self.added_tokens_list = [] - self.added_tokens_dict = dict() - self.added_tokens_ids = set() - - # Process added tokens - for tok, tokidx in sorted( - self.tokenizer.get_added_vocab().items(), key=lambda x: x[1] - ): - # Only consider added tokens that are not in the base vocabulary - if tokidx >= self.tokenizer.vocab_size: - self.added_tokens_list.append(tok) - self.added_tokens_dict[tok] = tokidx - self.added_tokens_ids.add(tokidx) - - # Store special tokens and their IDs - self.specials = { - tok: self.tokenizer.get_vocab()[tok] - for tok in self.tokenizer.all_special_tokens - } - self.special_ids = set(self.tokenizer.all_special_ids) - - # Set vocabulary sizes - self.vocab_size_base = self.tokenizer.vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - - self.fname_tokenizer = fname_tokenizer - - def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - reverse_vocab = { - id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() - } - - for token_id in range(self.vocab_size_base): - # Skip processing added tokens here - if token_id in self.added_tokens_ids: - continue - - # Convert token text to bytes - token_text = reverse_vocab[token_id].encode("utf-8") - - # Yield token text, score, and type - yield token_text, self.get_token_score(token_id), self.get_token_type( - token_id, token_text, self.special_ids # Reuse already stored special IDs - ) - - def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: - # Special case for byte tokens - if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): - return gguf.TokenType.BYTE - - # Determine token type based on whether it's a special token - return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL - - def get_token_score(self, token_id: int) -> float: - # Placeholder for actual logic to determine the token's score - # This needs to be implemented based on specific requirements - return -1000.0 # Default score - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - if text in self.specials: - toktype = self.get_token_type(self.specials[text], b'', self.special_ids) - score = self.get_token_score(self.specials[text]) - else: - toktype = gguf.TokenType.USER_DEFINED - score = -1000.0 - - yield text.encode("utf-8"), score, toktype - - def has_newline_token(self): - return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.hf_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - # # data loading # TODO: reuse (probably move to gguf.py?) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 413e433dd9c07..74f021dec5e17 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \ --projector-type ldpv2 ``` -4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: +4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./convert.py path/to/MobileVLM-1.7B +python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B ``` 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` diff --git a/examples/llava/README.md b/examples/llava/README.md index 4fb0cf3816383..8d1ae5270e458 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b ``` -5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: +5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./convert.py ../llava-v1.5-7b --skip-unknown +python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown ``` Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. @@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto 6) Then convert the model to gguf format: ```console -python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown +python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` 7) And finally we can run the llava-cli using the 1.6 model version: diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt index f80f727a79307..17cb4d5e5ee8e 100644 --- a/examples/llava/requirements.txt +++ b/examples/llava/requirements.txt @@ -1,3 +1,3 @@ --r ../../requirements/requirements-convert.txt +-r ../../requirements/requirements-convert-legacy-llama.txt pillow~=10.2.0 torch~=2.1.1 diff --git a/examples/make-ggml.py b/examples/make-ggml.py deleted file mode 100755 index c73485ebf1eff..0000000000000 --- a/examples/make-ggml.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 -""" -This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them. - -Usage: -python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)] - -Arguments: -- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub. -- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox. -- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used. -- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used. -- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'. -- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created. - -Old quant types (some base model types require these): -- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M -- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L -- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M -- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M - -New quant types (recommended): -- Q2_K: smallest, extreme quality loss - not recommended -- Q3_K: alias for Q3_K_M -- Q3_K_S: very small, very high quality loss -- Q3_K_M: very small, very high quality loss -- Q3_K_L: small, substantial quality loss -- Q4_K: alias for Q4_K_M -- Q4_K_S: small, significant quality loss -- Q4_K_M: medium, balanced quality - recommended -- Q5_K: alias for Q5_K_M -- Q5_K_S: large, low quality loss - recommended -- Q5_K_M: large, very low quality loss - recommended -- Q6_K: very large, extremely low quality loss -- Q8_0: very large, extremely low quality loss - not recommended -- F16: extremely large, virtually no quality loss - not recommended -- F32: absolutely huge, lossless - not recommended -""" -import subprocess -subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True) - -import argparse -import os -from huggingface_hub import snapshot_download - -def main(model, model_type, outname, outdir, quants, keep_fp16): - if not os.path.isdir(model): - print(f"Model not found at {model}. Downloading...") - try: - if outname is None: - outname = model.split('/')[-1] - model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache') - except Exception as e: - raise Exception(f"Could not download the model: {e}") - - if outdir is None: - outdir = f'../models/{outname}' - - if not os.path.isfile(f"{model}/config.json"): - raise Exception(f"Could not find config.json in {model}") - - os.makedirs(outdir, exist_ok=True) - - print("Building llama.cpp") - subprocess.run(f"cd .. && make quantize", shell=True, check=True) - - fp16 = f"{outdir}/{outname}.gguf.fp16.bin" - - print(f"Making unquantised GGUF at {fp16}") - if not os.path.isfile(fp16): - if model_type != "llama": - subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True) - else: - subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True) - else: - print(f"Unquantised GGML already exists at: {fp16}") - - print("Making quants") - for type in quants: - outfile = f"{outdir}/{outname}.gguf.{type}.bin" - print(f"Making {type} : {outfile}") - subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True) - - if not keep_fp16: - os.remove(fp16) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.') - parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name') - parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.') - parser.add_argument('--outname', default=None, help='Output model(s) name') - parser.add_argument('--outdir', default=None, help='Output directory') - parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types') - parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False) - - args = parser.parse_args() - - main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 3ba99be4f4489..dc574991381a8 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -1,10 +1,15 @@ from __future__ import annotations +import re import logging import json import os from pathlib import Path -from typing import Any, Callable, Sequence, Mapping, Iterable +from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable + +from sentencepiece import SentencePieceProcessor + +import gguf from .gguf_writer import GGUFWriter @@ -163,3 +168,298 @@ def _try_load_from_config_json(self, path: Path) -> bool: for typ in self.special_token_types: self._set_special_token(typ, config.get(f'{typ}_token_id')) return True + + +@runtime_checkable +class BaseVocab(Protocol): + tokenizer_model: ClassVar[str] + name: ClassVar[str] + + +@runtime_checkable +class Vocab(BaseVocab, Protocol): + vocab_size: int + added_tokens_dict: dict[str, int] + added_tokens_list: list[str] + fname_tokenizer: Path + + def __init__(self, base_path: Path): ... + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ... + + +class NoVocab(BaseVocab): + tokenizer_model = "no_vocab" + name = "no_vocab" + + def __repr__(self) -> str: + return "" + + +class BpeVocab(Vocab): + tokenizer_model = "gpt2" + name = "bpe" + + def __init__(self, base_path: Path): + added_tokens: dict[str, int] = {} + + if (fname_tokenizer := base_path / 'vocab.json').exists(): + # "slow" tokenizer + with open(fname_tokenizer, encoding="utf-8") as f: + self.vocab = json.load(f) + + try: + # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. + with open(base_path / 'added_tokens.json', encoding="utf-8") as f: + added_tokens = json.load(f) + except FileNotFoundError: + pass + else: + # "fast" tokenizer + fname_tokenizer = base_path / 'tokenizer.json' + + # if this fails, FileNotFoundError propagates to caller + with open(fname_tokenizer, encoding="utf-8") as f: + tokenizer_json = json.load(f) + + tokenizer_model: dict[str, Any] = tokenizer_json['model'] + if ( + tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'ByteLevel' + ): + raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer') + + self.vocab = tokenizer_model["vocab"] + + if (added := tokenizer_json.get('added_tokens')) is not None: + # Added tokens here can be duplicates of the main vocabulary. + added_tokens = {item['content']: item['id'] + for item in added + if item['content'] not in self.vocab} + + vocab_size = len(self.vocab) + expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) + actual_ids = sorted(added_tokens.values()) + if expected_ids != actual_ids: + expected_end_id = vocab_size + len(actual_ids) - 1 + raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " + f"{vocab_size} - {expected_end_id}; got {actual_ids}") + + items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) + self.added_tokens_dict = added_tokens + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer + + def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} + + for i, _ in enumerate(self.vocab): + yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL + + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + for text in self.added_tokens_list: + score = -1000.0 + yield text.encode("utf-8"), score, gguf.TokenType.CONTROL + + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + yield from self.bpe_tokens() + yield from self.added_tokens() + + def __repr__(self) -> str: + return f"" + + +class SentencePieceVocab(Vocab): + tokenizer_model = "llama" + name = "spm" + + def __init__(self, base_path: Path): + added_tokens: dict[str, int] = {} + if (fname_tokenizer := base_path / 'tokenizer.model').exists(): + # normal location + try: + with open(base_path / 'added_tokens.json', encoding="utf-8") as f: + added_tokens = json.load(f) + except FileNotFoundError: + pass + elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists(): + # not found in alternate location either + raise FileNotFoundError('Cannot find tokenizer.model') + + self.sentencepiece_tokenizer = SentencePieceProcessor() + self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer)) + vocab_size = self.sentencepiece_tokenizer.vocab_size() + + new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} + expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) + actual_new_ids = sorted(new_tokens.keys()) + + if expected_new_ids != actual_new_ids: + raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") + + # Token pieces that were added to the base vocabulary. + self.added_tokens_dict = added_tokens + self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer + + def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + tokenizer = self.sentencepiece_tokenizer + for i in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(i) + text = piece.encode("utf-8") + score: float = tokenizer.GetScore(i) + + toktype = gguf.TokenType.NORMAL + if tokenizer.IsUnknown(i): + toktype = gguf.TokenType.UNKNOWN + if tokenizer.IsControl(i): + toktype = gguf.TokenType.CONTROL + + # NOTE: I think added_tokens are user defined. + # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto + # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED + + if tokenizer.IsUnused(i): + toktype = gguf.TokenType.UNUSED + if tokenizer.IsByte(i): + toktype = gguf.TokenType.BYTE + + yield text, score, toktype + + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + for text in self.added_tokens_list: + score = -1000.0 + yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED + + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + yield from self.sentencepiece_tokens() + yield from self.added_tokens() + + def __repr__(self) -> str: + return f"" + + +class LlamaHfVocab(Vocab): + tokenizer_model = "llama" + name = "hfft" + + def __init__(self, base_path: Path): + fname_tokenizer = base_path / 'tokenizer.json' + # if this fails, FileNotFoundError propagates to caller + with open(fname_tokenizer, encoding='utf-8') as f: + tokenizer_json = json.load(f) + + # pre-check so we know if we need transformers + tokenizer_model: dict[str, Any] = tokenizer_json['model'] + is_llama3 = ( + tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False) + and not tokenizer_model.get('byte_fallback', True) + ) + if is_llama3: + raise TypeError('Llama 3 must be converted with BpeVocab') + + if not is_llama3 and ( + tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'Sequence' + ): + raise FileNotFoundError('Cannot find Llama BPE tokenizer') + + try: + from transformers import AutoTokenizer + except ImportError as e: + raise ImportError( + "To use LlamaHfVocab, please install the `transformers` package. " + "You can install it with `pip install transformers`." + ) from e + + # Allow the tokenizer to default to slow or fast versions. + # Explicitly set tokenizer to use local paths. + self.tokenizer = AutoTokenizer.from_pretrained( + base_path, + cache_dir=base_path, + local_files_only=True, + ) + assert self.tokenizer.is_fast # assume tokenizer.json is used + + # Initialize lists and dictionaries for added tokens + self.added_tokens_list = [] + self.added_tokens_dict = dict() + self.added_tokens_ids = set() + + # Process added tokens + for tok, tokidx in sorted( + self.tokenizer.get_added_vocab().items(), key=lambda x: x[1] + ): + # Only consider added tokens that are not in the base vocabulary + if tokidx >= self.tokenizer.vocab_size: + self.added_tokens_list.append(tok) + self.added_tokens_dict[tok] = tokidx + self.added_tokens_ids.add(tokidx) + + # Store special tokens and their IDs + self.specials = { + tok: self.tokenizer.get_vocab()[tok] + for tok in self.tokenizer.all_special_tokens + } + self.special_ids = set(self.tokenizer.all_special_ids) + + # Set vocabulary sizes + self.vocab_size_base = self.tokenizer.vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + + self.fname_tokenizer = fname_tokenizer + + def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + reverse_vocab = { + id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() + } + + for token_id in range(self.vocab_size_base): + # Skip processing added tokens here + if token_id in self.added_tokens_ids: + continue + + # Convert token text to bytes + token_text = reverse_vocab[token_id].encode("utf-8") + + # Yield token text, score, and type + yield token_text, self.get_token_score(token_id), self.get_token_type( + token_id, token_text, self.special_ids # Reuse already stored special IDs + ) + + def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: + # Special case for byte tokens + if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + return gguf.TokenType.BYTE + + # Determine token type based on whether it's a special token + return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL + + def get_token_score(self, token_id: int) -> float: + # Placeholder for actual logic to determine the token's score + # This needs to be implemented based on specific requirements + return -1000.0 # Default score + + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + for text in self.added_tokens_list: + if text in self.specials: + toktype = self.get_token_type(self.specials[text], b'', self.special_ids) + score = self.get_token_score(self.specials[text]) + else: + toktype = gguf.TokenType.USER_DEFINED + score = -1000.0 + + yield text.encode("utf-8"), score, toktype + + def has_newline_token(self): + return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab + + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + yield from self.hf_tokens() + yield from self.added_tokens() + + def __repr__(self) -> str: + return f"" diff --git a/requirements.txt b/requirements.txt index 43f82dc2e600d..e5cfbf10b3da5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ # Package versions must stay compatible across all top-level python scripts. # --r ./requirements/requirements-convert.txt +-r ./requirements/requirements-convert-legacy-llama.txt -r ./requirements/requirements-convert-hf-to-gguf.txt -r ./requirements/requirements-convert-hf-to-gguf-update.txt diff --git a/requirements/requirements-convert-hf-to-gguf-update.txt b/requirements/requirements-convert-hf-to-gguf-update.txt index 6ac4026107fbe..6eacaf4290e0a 100644 --- a/requirements/requirements-convert-hf-to-gguf-update.txt +++ b/requirements/requirements-convert-hf-to-gguf-update.txt @@ -1,2 +1,2 @@ --r ./requirements-convert.txt +-r ./requirements-convert-legacy-llama.txt torch~=2.1.1 diff --git a/requirements/requirements-convert-hf-to-gguf.txt b/requirements/requirements-convert-hf-to-gguf.txt index 6ac4026107fbe..6eacaf4290e0a 100644 --- a/requirements/requirements-convert-hf-to-gguf.txt +++ b/requirements/requirements-convert-hf-to-gguf.txt @@ -1,2 +1,2 @@ --r ./requirements-convert.txt +-r ./requirements-convert-legacy-llama.txt torch~=2.1.1 diff --git a/requirements/requirements-convert.txt b/requirements/requirements-convert-legacy-llama.txt similarity index 100% rename from requirements/requirements-convert.txt rename to requirements/requirements-convert-legacy-llama.txt diff --git a/requirements/requirements-convert-llama-ggml-to-gguf.txt b/requirements/requirements-convert-llama-ggml-to-gguf.txt index a0f37cd1c71e4..e80c29012a674 100644 --- a/requirements/requirements-convert-llama-ggml-to-gguf.txt +++ b/requirements/requirements-convert-llama-ggml-to-gguf.txt @@ -1 +1 @@ --r ./requirements-convert.txt +-r ./requirements-convert-legacy-llama.txt diff --git a/scripts/check-requirements.sh b/scripts/check-requirements.sh index 6a7400d3c3a0b..0c6afdd591aaa 100755 --- a/scripts/check-requirements.sh +++ b/scripts/check-requirements.sh @@ -166,7 +166,7 @@ if (( do_cleanup )); then rm -rf -- "$all_venv" fi -check_convert_script convert.py +check_convert_script examples/convert-legacy-llama.py for py in convert-*.py; do # skip convert-hf-to-gguf-update.py # TODO: the check is failing for some reason: diff --git a/scripts/convert-gg.sh b/scripts/convert-gg.sh index 01fda16fd7efc..8a016843290b9 100755 --- a/scripts/convert-gg.sh +++ b/scripts/convert-gg.sh @@ -3,20 +3,20 @@ set -e # LLaMA v1 -python3 convert.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16 -python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16 -python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16 -python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16 # LLaMA v2 -python3 convert.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16 -python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16 -python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16 # Code Llama -python3 convert.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16 -python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16 -python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16 +python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16 # Falcon python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1 diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 2058ceabf9730..5dabbf60e6fc8 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -75,7 +75,7 @@ if [ "$1" -eq "1" ]; then cd /workspace/llama.cpp - python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16 + python3 examples/convert-legacy-llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16 ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0 ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k @@ -90,7 +90,7 @@ if [ "$1" -eq "2" ]; then cd /workspace/llama.cpp - python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16 + python3 examples/convert-legacy-llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16 ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0 ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k @@ -105,7 +105,7 @@ if [ "$1" -eq "3" ]; then cd /workspace/llama.cpp - python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16 + python3 examples/convert-legacy-llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16 ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0 ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k @@ -120,7 +120,7 @@ if [ "$1" -eq "4" ]; then cd /workspace/llama.cpp - python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16 + python3 examples/convert-legacy-llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16 ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0 ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k @@ -135,7 +135,7 @@ if [ "$1" -eq "5" ]; then cd /workspace/llama.cpp - python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16 + python3 examples/convert-legacy-llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16 ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0 ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k @@ -150,7 +150,7 @@ if [ "$1" -eq "6" ]; then cd /workspace/llama.cpp - python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16 + python3 examples/convert-legacy-llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16 ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0 ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k @@ -165,7 +165,7 @@ if [ "$1" -eq "7" ]; then cd /workspace/llama.cpp - python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16 + python3 examples/convert-legacy-llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16 ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0 ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k