diff --git a/.gitignore b/.gitignore index c7cbd954dadcb..04bbc037e672c 100644 --- a/.gitignore +++ b/.gitignore @@ -78,7 +78,6 @@ models-mnt !models/ggml-vocab-*.gguf* # Zig - zig-out/ zig-cache/ diff --git a/Makefile b/Makefile index d41ebfd42a73b..649671ed6a72e 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,7 @@ BUILD_TARGETS = \ llama-imatrix \ llama-infill \ llama-llava-cli \ + llama-minicpmv-cli\ llama-lookahead \ llama-lookup \ llama-lookup-create \ @@ -1206,6 +1207,7 @@ clean: rm -rvf ggml/*.dll rm -rvf ggml/*.so rm -vrf ggml/src/*.o + rm -rvf ggml/src/llamafile/*.o rm -rvf common/build-info.cpp rm -vrf ggml/src/ggml-metal-embed.metal rm -vrf ggml/src/ggml-cuda/*.o @@ -1452,15 +1454,20 @@ libllava.a: examples/llava/llava.cpp \ $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual llama-llava-cli: examples/llava/llava-cli.cpp \ - examples/llava/clip.h \ - examples/llava/clip.cpp \ + examples/llava/llava.cpp \ examples/llava/llava.h \ + examples/llava/clip.cpp \ + examples/llava/clip.h \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual + +llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ examples/llava/llava.cpp \ + examples/llava/llava.h \ + examples/llava/clip.cpp \ + examples/llava/clip.h \ $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual - $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift diff --git a/common/common.cpp b/common/common.cpp index 
a2f032c8df01c..7c93f6ba89ba5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1804,6 +1804,17 @@ std::string string_get_sortable_timestamp() { return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); } +void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); + } +} + void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -2172,7 +2183,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { tmp.clear(); tmp.push_back(decoder_start_token_id); } - llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + if (llama_model_has_decoder(model)) { + llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + } llama_kv_cache_clear(lctx); llama_synchronize(lctx); llama_reset_timings(lctx); diff --git a/common/common.h b/common/common.h index d88966ece20aa..bbc33a499afcd 100644 --- a/common/common.h +++ b/common/common.h @@ -286,6 +286,8 @@ std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); std::string string_get_sortable_timestamp(); +void string_replace_all(std::string & s, const std::string & search, const std::string & replace); + template static std::vector string_split(const std::string & str, char delim) { std::vector values; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3513840b815e3..a3d9c9491b02d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -251,12 +251,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), 
data_torch)] - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid, n_dims # unused - - return False - - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid, n_dims # unused return False @@ -285,54 +280,46 @@ def prepare_tensors(self): for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): data: np.ndarray # type hint n_dims = len(data.shape) - data_dtype = data.dtype - data_qtype: gguf.GGMLQuantizationType | None = None - - # when both are True, f32 should win - extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) - extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) + data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - extra_f32 = any(cond for cond in ( - extra_f32, - n_dims == 1, - new_name.endswith("_norm.weight"), - )) + if n_dims <= 1 or new_name.endswith("_norm.weight"): + data_qtype = gguf.GGMLQuantizationType.F32 + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp # Some tensor types are always in float32 - extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - )) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - extra_f16 = any(cond for cond in ( - extra_f16, - (name.endswith(".weight") and n_dims >= 2), - )) - - if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: - if self.ftype == 
gguf.LlamaFileType.MOSTLY_BF16: - data = gguf.quantize_bf16(data) - assert data.dtype == np.uint16 - data_qtype = gguf.GGMLQuantizationType.BF16 - - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): - data = gguf.quantize_q8_0(data) - assert data.dtype == np.uint8 - data_qtype = gguf.GGMLQuantizationType.Q8_0 + if data_qtype is False and ( + any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + ) + ) + or not name.endswith(".weight") + ): + data_qtype = gguf.GGMLQuantizationType.F32 - else: # default to float16 for quantized tensors - if data_dtype != np.float16: - data = data.astype(np.float16) + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) + if isinstance(data_qtype, bool): + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + else: + raise ValueError(f"Unknown file type: {self.ftype.name}") - if data_qtype is None: # by default, convert to float32 - if data_dtype != np.float32: - data = data.astype(np.float32) - data_qtype = gguf.GGMLQuantizationType.F32 + try: + data = gguf.quants.quantize(data, data_qtype) + except gguf.QuantError as e: + logger.warning("%s, %s", e, "falling back to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape @@ -1762,7 +1749,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] - def extra_f16_tensors(self, 
name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid # unused return n_dims > 1 @@ -2783,18 +2770,22 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del n_dims # unused - - return bid is not None and new_name in ( - self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [ + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + if bid is not None and new_name in ( + self.format_tensor_name( + n, bid, ".weight" if name.endswith(".weight") else "" + ) + for n in [ gguf.MODEL_TENSOR.SSM_CONV1D, gguf.MODEL_TENSOR.SSM_X, gguf.MODEL_TENSOR.SSM_DT, gguf.MODEL_TENSOR.SSM_A, gguf.MODEL_TENSOR.SSM_D, ] - ) + ): + return gguf.GGMLQuantizationType.F32 + + return super().tensor_force_quant(name, new_name, bid, n_dims) @Model.register("CohereForCausalLM") @@ -3330,6 +3321,145 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@Model.register("T5EncoderModel") +class T5EncoderModel(Model): + model_arch = gguf.MODEL_ARCH.T5ENCODER + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.shared_token_embeddings_found = False + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + # many older models use spiece.model tokenizer model 
filename + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'spiece.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct + assert tokenizer_path.name == 'tokenizer.model' + return self._set_vocab_sentencepiece() + else: + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if 
added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_add_eos_token(True) + + def set_gguf_parameters(self): + if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: + logger.warning("Couldn't find context length in config.json, assuming default value of 512") + n_ctx = 512 + self.gguf_writer.add_context_length(n_ctx) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) + self.gguf_writer.add_block_count(self.hparams["num_layers"]) + self.gguf_writer.add_head_count(self.hparams["num_heads"]) + 
self.gguf_writer.add_key_length(self.hparams["d_kv"]) + self.gguf_writer.add_value_length(self.hparams["d_kv"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. + if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: + if not self.shared_token_embeddings_found: + name = "shared.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return [] + + return [(self.map_tensor_name(name), data_torch)] + + @Model.register("JAISLMHeadModel") class JaisModel(Model): model_arch = gguf.MODEL_ARCH.JAIS diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index d36ac0a158dd4..59a39fbb67395 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -80,7 +80,14 @@ The following release is verified with good quality: ### Intel GPU -**Verified devices** +SYCL backend supports Intel GPU Family: + +- Intel Data Center Max Series +- Intel Flex Series, Arc Series +- Intel Built-in Arc GPU +- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported 
GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)). + +#### Verified devices | Intel GPU | Status | Verified Model | |-------------------------------|---------|---------------------------------------| @@ -88,7 +95,7 @@ The following release is verified with good quality: | Intel Data Center Flex Series | Support | Flex 170 | | Intel Arc Series | Support | Arc 770, 730M, Arc A750 | | Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake | -| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 | +| Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 | *Notes:* @@ -237,6 +244,13 @@ Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA devic ### II. Build llama.cpp #### Intel GPU + +``` +./examples/sycl/build.sh +``` + +or + ```sh # Export relevant ENV variables source /opt/intel/oneapi/setvars.sh @@ -276,23 +290,26 @@ cmake --build build --config Release -j -v ### III. Run the inference -1. Retrieve and prepare model +#### Retrieve and prepare model You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. -2. Enable oneAPI running environment +##### Check device + +1. Enable oneAPI running environment ```sh source /opt/intel/oneapi/setvars.sh ``` -3. List devices information +2. List devices information Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ```sh ./build/bin/llama-ls-sycl-device ``` + This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. 
For example, in a system with 2 *intel GPU* it would look like the following: ``` found 2 SYCL devices: @@ -304,12 +321,37 @@ found 2 SYCL devices: | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| ``` +#### Choose level-zero devices + +|Chosen Device ID|Setting| +|-|-| +|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action| +|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`| +|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`| + +#### Execute + +Choose one of the following methods to run. + +1. Script + +- Use device 0: + +```sh +./examples/sycl/run_llama2.sh 0 +``` +- Use multiple devices: + +```sh +./examples/sycl/run_llama2.sh +``` -4. Launch inference +2. Command line +Launch inference There are two device selection modes: -- Single device: Use one device target specified by the user. +- Single device: Use one device assigned by user. Default device id is 0. - Multiple devices: Automatically choose the devices with the same backend. In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. @@ -326,11 +368,6 @@ Examples: ```sh ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 ``` -or run by script: - -```sh -./examples/sycl/run_llama2.sh 0 -``` - Use multiple devices: @@ -338,12 +375,6 @@ or run by script: ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` -Otherwise, you can run the script: - -```sh -./examples/sycl/run_llama2.sh -``` - *Notes:* - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: @@ -390,7 +421,7 @@ c.
Verify installation In the oneAPI command line, run the following to print the available SYCL devices: ``` -sycl-ls +sycl-ls.exe ``` There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device: @@ -411,6 +442,18 @@ b. The new Visual Studio will install Ninja as default. (If not, please install ### II. Build llama.cpp +You could download the release package for Windows directly, which including binary files and depended oneAPI dll files. + +Choose one of following methods to build from source code. + +1. Script + +```sh +.\examples\sycl\win-build-sycl.bat +``` + +2. CMake + On the oneAPI command line window, step into the llama.cpp main directory and run the following: ``` @@ -425,12 +468,8 @@ cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPI cmake --build build --config Release -j ``` -Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions: -```sh -.\examples\sycl\win-build-sycl.bat -``` - Or, use CMake presets to build: + ```sh cmake --preset x64-windows-sycl-release cmake --build build-x64-windows-sycl-release -j --target llama-cli @@ -442,7 +481,9 @@ cmake --preset x64-windows-sycl-debug cmake --build build-x64-windows-sycl-debug -j --target llama-cli ``` -Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. +3. Visual Studio + +You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project. *Notes:* @@ -450,23 +491,25 @@ Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choos ### III. Run the inference -1. 
Retrieve and prepare model +#### Retrieve and prepare model -You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. +You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. -2. Enable oneAPI running environment +##### Check device + +1. Enable oneAPI running environment On the oneAPI command line window, run the following and step into the llama.cpp directory: ``` "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 ``` -3. List devices information +2. List devices information Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ``` -build\bin\ls-sycl-device.exe +build\bin\llama-ls-sycl-device.exe ``` This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: @@ -478,10 +521,28 @@ found 2 SYCL devices: | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| +``` +#### Choose level-zero devices + +|Chosen Device ID|Setting| +|-|-| +|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action| +|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`| +|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`| + +#### Execute + +Choose one of the following methods to run. + +1. Script + +``` +examples\sycl\win-run-llama2.bat ``` +2. Command line -4.
Launch inference +Launch inference There are two device selection modes: @@ -508,11 +569,7 @@ build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website ca ``` build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer ``` -Otherwise, run the following wrapper script: -``` -.\examples\sycl\win-run-llama2.bat -``` Note: @@ -526,17 +583,18 @@ Or use 1 SYCL GPUs: [0] with Max compute units:512 ``` + ## Environment Variable #### Build | Name | Value | Function | |--------------------|-----------------------------------|---------------------------------------------| -| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. | +| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better performance than FP16 on quantized model| | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | -| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. | -| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | +| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | +| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | #### Runtime @@ -572,9 +630,18 @@ use 1 SYCL GPUs: [0] with Max compute units:512 ``` Otherwise, please double-check the GPU driver installation steps. +- Can I report an Ollama issue on Intel GPU to the llama.cpp SYCL backend? + + No. We can't support Ollama issues directly, because we aren't familiar with Ollama. + + We suggest reproducing the issue on llama.cpp and reporting it to llama.cpp. We will support it. + + The same applies to other projects that include the llama.cpp SYCL backend. + + ### **GitHub contribution**: Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay. ## TODO -- Support row layer split for multiple card runs. +- NA diff --git a/examples/embedding/README.md b/examples/embedding/README.md index e3705b4547677..12b372bf1df42 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.): ```bash -./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null +./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null ``` ### Windows: ```powershell -llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null +llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!"
2>$null +llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. @@ -50,11 +50,11 @@ The above command will output space-separated float values. ### Unix-based systems (Linux, macOS, etc.): ```bash -./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` ### Windows: ```powershell -embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index cd7b448a619fa..b05aa006e7da5 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -31,13 +31,24 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + const struct llama_model * model = llama_get_model(ctx); + // clear previous kv_cache values (irrelevant for embeddings) llama_kv_cache_clear(ctx); // run model fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to 
decode\n", __func__); + if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) { + // encoder-only model + if (llama_encode(ctx, batch) < 0) { + fprintf(stderr, "%s : failed to encode\n", __func__); + } + } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) { + // decoder-only model + if (llama_decode(ctx, batch) < 0) { + fprintf(stderr, "%s : failed to decode\n", __func__); + } } for (int i = 0; i < batch.n_tokens; i++) { @@ -45,11 +56,22 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu continue; } - // try to get sequence embeddings - supported only when pooling_type is not NONE - const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); + const float * embd = nullptr; + int embd_pos = 0; + + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + // try to get token embeddings + embd = llama_get_embeddings_ith(ctx, i); + embd_pos = i; + GGML_ASSERT(embd != NULL && "failed to get token embeddings"); + } else { + // try to get sequence embeddings - supported only when pooling_type is not NONE + embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + embd_pos = batch.seq_id[i][0]; + GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); + } - float * out = output + batch.seq_id[i][0] * n_embd; + float * out = output + embd_pos * n_embd; llama_embd_normalize(embd, out, n_embd, embd_norm); } } @@ -93,8 +115,9 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + + if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) { + fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__); return 1; } @@ -153,13 +176,23 @@ int main(int 
argc, char ** argv) { const int n_prompts = prompts.size(); struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + // count number of embeddings + int n_embd_count = 0; + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + for (int k = 0; k < n_prompts; k++) { + n_embd_count += inputs[k].size(); + } + } else { + n_embd_count = n_prompts; + } + // allocate output const int n_embd = llama_n_embd(model); - std::vector embeddings(n_prompts * n_embd, 0); + std::vector embeddings(n_embd_count * n_embd, 0); float * emb = embeddings.data(); // break into batches - int p = 0; // number of prompts processed already + int e = 0; // number of embeddings already stored int s = 0; // number of prompts in current batch for (int k = 0; k < n_prompts; k++) { // clamp to n_batch tokens @@ -169,11 +202,11 @@ int main(int argc, char ** argv) { // encode if at capacity if (batch.n_tokens + n_toks > n_batch) { - float * out = emb + p * n_embd; + float * out = emb + e * n_embd; batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - llama_batch_clear(batch); - p += s; + e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; s = 0; + llama_batch_clear(batch); } // add to batch @@ -182,39 +215,62 @@ int main(int argc, char ** argv) { } // final batch - float * out = emb + p * n_embd; + float * out = emb + e * n_embd; batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); if (params.embd_out.empty()) { - // print the first part of the embeddings or for a single prompt, the full embedding fprintf(stdout, "\n"); - for (int j = 0; j < n_prompts; j++) { - fprintf(stdout, "embedding %d: ", j); - for (int i = 0; i < (n_prompts > 1 ? 
std::min(16, n_embd) : n_embd); i++) { - if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); - } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + for (int j = 0; j < n_embd_count; j++) { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < std::min(3, n_embd); i++) { + if (params.embd_normalize == 0) { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } else { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, " ... "); + for (int i = n_embd - 3; i < n_embd; i++) { + if (params.embd_normalize == 0) { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } else { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } } + fprintf(stdout, "\n"); } - fprintf(stdout, "\n"); - } - - // print cosine similarity matrix - if (n_prompts > 1) { - fprintf(stdout, "\n"); - printf("cosine similarity matrix:\n\n"); - for (int i = 0; i < n_prompts; i++) { - fprintf(stdout, "%6.6s ", prompts[i].c_str()); + } else { + // print the first part of the embeddings or for a single prompt, the full embedding + for (int j = 0; j < n_prompts; j++) { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < (n_prompts > 1 ? 
std::min(16, n_embd) : n_embd); i++) { + if (params.embd_normalize == 0) { + fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + } else { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + } + fprintf(stdout, "\n"); } - fprintf(stdout, "\n"); - for (int i = 0; i < n_prompts; i++) { - for (int j = 0; j < n_prompts; j++) { - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f ", sim); + + // print cosine similarity matrix + if (n_prompts > 1) { + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + fprintf(stdout, "%6.6s ", prompts[i].c_str()); } - fprintf(stdout, "%1.10s", prompts[i].c_str()); fprintf(stdout, "\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "%1.10s", prompts[i].c_str()); + fprintf(stdout, "\n"); + } } } } @@ -233,23 +289,23 @@ int main(int argc, char ** argv) { } fprintf(stdout, notArray ? "]\n }" : "]"); j++; - if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break; + if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break; } fprintf(stdout, notArray ? 
"\n ]" : "]\n"); if (params.embd_out == "json+" && n_prompts > 1) { fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); - for (int i = 0;;) { // at least two iteration (n_prompts > 1) + for (int i = 0;;) { // at least two iteration (n_embd_count > 1) fprintf(stdout, " ["); - for (int j = 0;;) { // at least two iteration (n_prompts > 1) + for (int j = 0;;) { // at least two iteration (n_embd_count > 1) float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); fprintf(stdout, "%6.2f", sim); j++; - if (j < n_prompts) fprintf(stdout, ", "); else break; + if (j < n_embd_count) fprintf(stdout, ", "); else break; } fprintf(stdout, " ]"); i++; - if (i < n_prompts) fprintf(stdout, ",\n"); else break; + if (i < n_embd_count) fprintf(stdout, ",\n"); else break; } fprintf(stdout, "\n ]"); } diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index d228ae66eeeec..3176d6e26ef8b 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -50,20 +50,6 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context return ctx_gguf; } -static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; - } - s = std::move(result); -} - struct file_input { struct ggml_context * ctx_meta = nullptr; struct gguf_context * ctx_gguf = nullptr; diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index e9fa73acb097b..bbf5fec586feb 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -36,3 +36,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) install(TARGETS ${TARGET} RUNTIME) 
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET llama-minicpmv-cli) +add_executable(${TARGET} minicpmv-cli.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md new file mode 100644 index 0000000000000..4affc1d0f26ff --- /dev/null +++ b/examples/llava/README-minicpmv2.5.md @@ -0,0 +1,99 @@ +## MiniCPM-Llama3-V 2.5 + +### Prepare models and code + +Download the [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from Hugging Face into a "MiniCPM-Llama3-V-2_5" folder. + +Clone llama.cpp: +```bash +git clone https://github.com/ggerganov/llama.cpp +cd llama.cpp +``` + +### Usage + +Convert the PyTorch model to gguf files (you can also download the [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files we have already converted) + +```bash +python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5 +python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 +python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model + +# quantize int4 version +./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M +``` + +Build for Linux or Mac + +```bash +make +make llama-minicpmv-cli +``` + +Inference on Linux or Mac +``` +# run f16 version +./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 
--repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" + +# run quantized int4 version +./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" + +# or run in interactive mode +./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i +``` + +### Android + +#### Build on Android device using Termux +We found that building directly on the Android device gives better runtime performance, so we recommend building on the device. + +[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required). + +Install tools in Termux: +``` +apt update && apt upgrade -y +apt install git make cmake +``` + +It's recommended to move your model inside the `~/` directory for best performance: +``` +cd storage/downloads +mv model.gguf ~/ +``` + +#### Building the Project using Android NDK +Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. + +Execute the following commands on your computer to avoid downloading the NDK to your mobile device. Alternatively, you can also do this in Termux: + +```bash +mkdir build-android +cd build-android +export NDK=/your_ndk_path +cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. +make +``` + +Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (on Android 11+, run the command twice). + +Finally, copy these built `llama` binaries and the model file to your device storage. 
Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: + +(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) +``` +$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ +$cd /data/data/com.termux/files/home/bin +$chmod +x ./* +``` + +Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` + +``` +$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ +$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ +``` + +Now, you can start chatting: +``` +$cd /data/data/com.termux/files/home/bin +$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +``` diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 7cda5f10c4b05..54aa822c90d29 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -74,26 +74,27 @@ static std::string format(const char * fmt, ...) 
{ // key constants // -#define KEY_FTYPE "general.file_type" -#define KEY_NAME "general.name" -#define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" -#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_USE_GELU "clip.use_gelu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" +#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" #define 
KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -127,12 +128,20 @@ static std::string format(const char * fmt, ...) { #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" +#define TN_MINICPMV_QUERY "resampler.query" +#define TN_MINICPMV_PROJ "resampler.proj.weight" +#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" +#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" +#define TN_MINICPMV_LN "resampler.ln_%s.%s" + enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, + PROJECTOR_TYPE_RESAMPLER, PROJECTOR_TYPE_UNKNOWN, }; @@ -140,6 +149,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MLP, "mlp" }, { PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDPV2, "ldpv2"}, + { PROJECTOR_TYPE_RESAMPLER, "resampler"}, }; @@ -200,17 +210,14 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int } static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); } - s = std::move(result); } static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { @@ -492,12 +499,33 @@ struct clip_vision_model { struct ggml_tensor * mm_model_mlp_2_b; struct ggml_tensor * mm_model_peg_0_w; struct ggml_tensor * mm_model_peg_0_b; + + // MINICPMV projection + struct 
ggml_tensor * mm_model_pos_embed_k; + struct ggml_tensor * mm_model_query; + struct ggml_tensor * mm_model_proj; + struct ggml_tensor * mm_model_kv_proj; + struct ggml_tensor * mm_model_attn_q_w; + struct ggml_tensor * mm_model_attn_q_b; + struct ggml_tensor * mm_model_attn_k_w; + struct ggml_tensor * mm_model_attn_k_b; + struct ggml_tensor * mm_model_attn_v_w; + struct ggml_tensor * mm_model_attn_v_b; + struct ggml_tensor * mm_model_attn_o_w; + struct ggml_tensor * mm_model_attn_o_b; + struct ggml_tensor * mm_model_ln_q_w; + struct ggml_tensor * mm_model_ln_q_b; + struct ggml_tensor * mm_model_ln_kv_w; + struct ggml_tensor * mm_model_ln_kv_b; + struct ggml_tensor * mm_model_ln_post_w; + struct ggml_tensor * mm_model_ln_post_b; }; struct clip_ctx { bool has_text_encoder = false; bool has_vision_encoder = false; bool has_llava_projector = false; + bool has_minicpmv_projector = false; struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -522,9 +550,11 @@ struct clip_ctx { ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; + + struct clip_image_size * load_image_size; }; -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); return nullptr; @@ -533,20 +563,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; + const int image_size = hparams.image_size; + int image_size_width = image_size; + int image_size_height = image_size; + if (ctx->has_minicpmv_projector) { + if (load_image_size == nullptr) { + load_image_size = clip_image_size_init(); + } + LOG_TEE("%s: %d %d\n", 
__func__, load_image_size->width, load_image_size->height); + image_size_width = load_image_size->width; + image_size_height = load_image_size->height; + if (is_inf) { + image_size_width = imgs->data->nx; + image_size_height = imgs->data->ny; + } + } const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); - const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; + int n_layer = hparams.n_layer; const float eps = hparams.eps; const int batch_size = imgs->size; - if (ctx->has_llava_projector) { + if (ctx->has_llava_projector || ctx->has_minicpmv_projector) { GGML_ASSERT(batch_size == 1); } @@ -559,7 +602,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph(ctx0); - struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); @@ -572,19 +615,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); inp = ggml_add(ctx0, inp, model.patch_bias); } - - // concat class_embeddings and patch_embeddings struct ggml_tensor * embeddings = inp; - if (ctx->has_class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - 
ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } + struct ggml_tensor * pos_embed = nullptr; + if (ctx->has_llava_projector) { + // concat class_embeddings and patch_embeddings + if (ctx->has_class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + } + } struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); ggml_set_name(positions, "positions"); @@ -593,6 +638,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); + if (ctx->has_minicpmv_projector) { + int pos_w = image_size_width/patch_size; + int pos_h = image_size_height/patch_size; + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); + ggml_set_name(pos_embed, "pos_embed"); + ggml_set_input(pos_embed); + } + // pre-layernorm if (ctx->has_pre_norm) { embeddings = ggml_norm(ctx0, embeddings, eps); @@ -602,6 +655,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // loop over layers + if (ctx->has_minicpmv_projector) { + n_layer += 1; + } for (int il = 0; il < n_layer - 1; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states @@ -691,7 +747,7 @@ static ggml_cgraph * 
clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // llava projector - { + if (ctx->has_llava_projector) { embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); @@ -872,6 +928,65 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 GGML_ABORT("fatal error"); } } + // minicpmv projector + else if (ctx->has_minicpmv_projector) + { + if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + struct ggml_tensor * q = model.mm_model_query; + { // layernorm + q = ggml_norm(ctx0, q, eps); + q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + } + struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + { // layernorm + v = ggml_norm(ctx0, v, eps); + v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); + } + struct ggml_tensor * k; + { // position + // q = ggml_add(ctx0, q, model.mm_model_pos_embed); + k = ggml_add(ctx0, v, pos_embed); + } + + { // attention + const int hidden_size = 4096; + const int d_head = 128; + const int n_head = hidden_size/d_head; + const int num_query = 96; + + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); + struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); + // permute + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + 
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + + embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); + } + { // layernorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); + } + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + } + else { + GGML_ASSERT(false); + } + } // build the graph ggml_build_forward_expand(gf, embeddings); @@ -1029,7 +1144,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); } - GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search + idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ); + if (idx != -1) { + new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx); + } + + // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search + GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(!new_clip->has_text_encoder); @@ -1040,6 +1161,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_TEE("%s: 
llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } @@ -1281,6 +1403,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight")); vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias")); } + else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) { + // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K); + vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY); + vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ); + vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ); + vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight")); + vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight")); + vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight")); + vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias")); + vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias")); + vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias")); + vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight")); + vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias")); + 
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight")); + vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias")); + vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight")); + vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias")); + vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); + vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -1319,7 +1462,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); clip_image_f32_batch batch; batch.size = 1; - ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); + ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); @@ -1328,6 +1471,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { return new_clip; } +void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { + ctx_clip->load_image_size = load_image_size; +} + +struct clip_image_size * clip_image_size_init() { + struct clip_image_size * load_image_size = new struct clip_image_size(); + load_image_size->width = 448; + load_image_size->height = 448; + return load_image_size; +} + struct clip_image_u8 * 
clip_image_u8_init() { return new clip_image_u8(); } @@ -1598,9 +1752,184 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im return patches; } +static int ensure_divide(int length, int patch_size) { + return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); +} + +static std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.first; + int height = original_size.second; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast(width) / height; + height = static_cast(scale_resolution / std::sqrt(r)); + width = static_cast(height * r); + } + int best_width = ensure_divide(width, patch_size); + int best_height = ensure_divide(height, patch_size); + return std::make_pair(best_width, best_height); +} + +static std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width, height; + std::tie(width, height) = original_size; + int grid_x, grid_y; + std::tie(grid_x, grid_y) = grid; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + int grid_width = refine_width / grid_x; + int grid_height = refine_height / grid_y; + + // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) + auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair + int best_grid_width, best_grid_height; + std::tie(best_grid_width, best_grid_height) = best_grid_size; + + // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) + std::pair refine_size = std::make_pair(best_grid_width * grid_x, 
best_grid_height * grid_y); // (new line) + return refine_size; +} + +inline int clip(int x, int lower, int upper) { + return std::max(lower, std::min(x, upper)); +} + +static std::pair uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::vector candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + std::vector> candidate_grids; + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.emplace_back(m, split_grids_nums / m); + } + ++m; + } + } + + std::pair best_grid{1, 1}; + float min_error = std::numeric_limits::infinity(); + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + return best_grid; +} + +// inspired from LLaVA-UHD: +// -> https://arxiv.org/pdf/2403.11703 +// -> https://github.com/thunlp/LLaVA-UHD +// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 +static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { + const std::pair original_size={img->nx,img->ny}; + const int original_width = img->nx; + const int original_height = img->ny; + const float log_ratio = log(1.0*original_width/original_height); + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + std::vector> images; + LOG_TEE("%s: multiple %d\n", __func__, multiple); + images.push_back(std::vector()); + + if (multiple <= 1) { + auto best_size = uhd_find_best_resize(original_size, scale_resolution, 
patch_size, true); + clip_image_u8 * source_image = clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.resize(best_size, Image.Resampling.BICUBIC) + images[images.size()-1].push_back(source_image); + } + else if (multiple > 1) { + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); + clip_image_u8 * source_image = clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); + images[images.size()-1].push_back(source_image); + + std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); + + auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); + clip_image_u8 * refine_image = clip_image_u8_init(); + bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); + + LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); + + // split_to_patches + int width = refine_image->nx; + int height = refine_image->ny; + int grid_x = int(width / best_grid.first); + int grid_y = int(height / best_grid.second); + for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ + images.push_back(std::vector()); + for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ + clip_image_u8 * patch = clip_image_u8_init(); + patch->nx = grid_x; + patch->ny = grid_y; + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = patches_i; y < patches_i + grid_y; ++y) { + for 
(int x = patches_j; x < patches_j + grid_x; ++x) { + const int i = 3 * (y * refine_image->nx + x); + const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j)); + patch->buf[j] = refine_image->buf[i]; + patch->buf[j+1] = refine_image->buf[i+1]; + patch->buf[j+2] = refine_image->buf[i+2]; + } + } + images[images.size()-1].push_back(patch); + } + } + } + return images; +} + +int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { + const int max_slice_nums=9; + const int scale_resolution=448; + const int original_width = ctx_clip->load_image_size->width; + const int original_height = ctx_clip->load_image_size->height; + const float log_ratio = log(1.0*original_width/original_height); + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + return best_grid.first; +} + // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { + if (clip_is_minicpmv(ctx)) { + std::vector> imgs = uhd_slice_image(img); + res_imgs->size = 0; + for (size_t i = 0; i < imgs.size(); ++i) { + res_imgs->size += imgs[i].size(); + } + res_imgs->data = new clip_image_f32[res_imgs->size]; + int idx = 0; + for (size_t i = 0; i < imgs.size(); ++i) { + for (size_t j = 0; j < imgs[i].size(); ++j) { + LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); + clip_image_f32 * res = clip_image_f32_init(); + normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); + res_imgs->data[idx++] = *res; + clip_image_f32_free(res); + } + } + return true; + } + bool pad_to_square = true; if 
(!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); @@ -1816,11 +2145,99 @@ int clip_n_patches(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { n_patches /= 4; + } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + n_patches = 96; } return n_patches; } +static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector> & pos) { + assert(embed_dim % 2 == 0); + int H = pos.size(); + int W = pos[0].size(); + + std::vector omega(embed_dim / 2); + for (int i = 0; i < embed_dim / 2; ++i) { + omega[i] = 1.0 / pow(10000.0, static_cast(i) / (embed_dim / 2)); + } + + std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + float out_value = pos[h][w] * omega[d]; + emb[h][w][d] = sin(out_value); + emb[h][w][d + embed_dim / 2] = cos(out_value); + } + } + } + + return emb; +} + +static std::vector>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector>> & grid) { + assert(embed_dim % 2 == 0); + std::vector>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2) + std::vector>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2) + + int H = emb_h.size(); + int W = emb_h[0].size(); + std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); + + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + emb[h][w][d] = emb_h[h][w][d]; + emb[h][w][d + embed_dim / 2] = emb_w[h][w][d]; + } + } + } + return emb; +} + +static std::vector> get_2d_sincos_pos_embed(int embed_dim, const std::pair image_size) { + int grid_h_size = image_size.first; + int grid_w_size = image_size.second; + + std::vector grid_h(grid_h_size); + std::vector grid_w(grid_w_size); + + for (int i = 0; i < grid_h_size; ++i) { + 
grid_h[i] = static_cast(i); + } + for (int i = 0; i < grid_w_size; ++i) { + grid_w[i] = static_cast(i); + } + + std::vector> grid(grid_h_size, std::vector(grid_w_size)); + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid[h][w] = grid_w[w]; + } + } + std::vector>> grid_2d = {grid, grid}; + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid_2d[0][h][w] = grid_h[h]; + grid_2d[1][h][w] = grid_w[w]; + } + } + + std::vector>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d); + + int H = image_size.first; + int W = image_size.second; + std::vector> pos_embed_2d(H * W, std::vector(embed_dim)); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + pos_embed_2d[w * H + h] = pos_embed_3d[h][w]; + } + } + + return pos_embed_2d; +} + bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); @@ -1843,18 +2260,27 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); // TODO: support multiple images } + if (ctx->has_minicpmv_projector) { + GGML_ASSERT(batch_size == 1); + } // build the inference graph - ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); // set inputs const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; + const int image_size = hparams.image_size; + int image_size_width = image_size; + int image_size_height = image_size; + if (ctx->has_minicpmv_projector) { + image_size_width = imgs->data[0].nx; + image_size_height = imgs->data[0].ny; + } const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / 
patch_size) * (image_size / patch_size)); + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); { @@ -1864,7 +2290,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima for (size_t i = 0; i < imgs->size; i++) { const int nx = imgs->data[i].nx; const int ny = imgs->data[i].ny; - GGML_ASSERT(nx == image_size && ny == image_size); + if (!ctx->has_minicpmv_projector) { + GGML_ASSERT(nx == image_size && ny == image_size); + } const int n = nx * ny; @@ -1881,37 +2309,75 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); free(data); } + if (ctx->has_minicpmv_projector) { + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + positions_data[i] = std::floor(70.0*i/num_positions); + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } - { - if (ctx->has_class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); + { + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); + if(ctx->load_image_size==nullptr){ + ctx->load_image_size= clip_image_size_init(); + } + int pos_w = ctx->load_image_size->width/patch_size; + int pos_h = 
ctx->load_image_size->height/patch_size; + int embed_dim = 4096; + auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + + float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); + for(int i=0;ihas_class_embedding) { + struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + void* zero_mem = malloc(ggml_nbytes(embeddings)); + memset(zero_mem, 0, ggml_nbytes(embeddings)); + ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); + free(zero_mem); + } + } + + { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + positions_data[i] = i; + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); - int* patches_data = (int*)malloc(ggml_nbytes(patches)); - for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + 1; + { + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { + patches_data[i] = i + 1; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); } if (ggml_backend_is_cpu(ctx->backend)) { @@ -2081,7 +2547,14 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { return ctx->vision_model.mm_3_b->ne[0]; } + 
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + return 4096; + } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } + +bool clip_is_minicpmv(const struct clip_ctx * ctx) { + return ctx->has_minicpmv_projector; +} diff --git a/examples/llava/clip.h b/examples/llava/clip.h index ca36313844c13..2ff4d39929dc3 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -18,14 +18,17 @@ # define CLIP_API #endif -struct clip_ctx; - #ifdef __cplusplus extern "C" { #endif struct clip_ctx; +struct clip_image_size { + int width; + int height; +}; + struct clip_image_u8_batch { struct clip_image_u8 * data; size_t size; @@ -55,6 +58,10 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); +CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); +CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); + +CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); @@ -78,6 +85,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); +CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx); + #ifdef __cplusplus } #endif diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 63878d176b0bb..916d9dc401dc4 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -202,6 +202,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector return true; } +static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) { + int width = image->nx; 
+ int height = image->ny; + int num_patches = (height / patch_size) * (width / patch_size); + clip_image_f32 * patch = clip_image_f32_init(); + patch->nx = patch_size * num_patches; + patch->ny = patch_size; + patch->buf.resize(3 * patch->nx * patch->ny); + + int patch_index = 0; + + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + for (int pi = 0; pi < patch_size; ++pi) { + for (int pj = 0; pj < patch_size; ++pj) { + int input_index = ((i + pi) * width + (j + pj)) * 3; + int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3; + patch->buf[output_index] = image->buf[input_index]; + patch->buf[output_index+1] = image->buf[input_index+1]; + patch->buf[output_index+2] = image->buf[input_index+2]; + } + } + patch_index++; + } + } + return patch; +} static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 @@ -218,7 +245,44 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { + if (clip_is_minicpmv(ctx_clip)) { + std::vector image_embd_v; + image_embd_v.resize(img_res_v.size); + struct clip_image_size * load_image_size = clip_image_size_init(); + for (size_t i = 0; i < img_res_v.size; i++) { + const int64_t t_img_enc_step_start_us = ggml_time_us(); + image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); + int patch_size=14; + load_image_size->width = img_res_v.data[i].nx; + load_image_size->height = img_res_v.data[i].ny; + clip_add_load_image_size(ctx_clip, load_image_size); + const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], 
patch_size), image_embd_v[i]); + if (!encoded) { + LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + return false; + } + const int64_t t_img_enc_steop_batch_us = ggml_time_us(); + LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); + } + const int64_t t_img_enc_batch_us = ggml_time_us(); + LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + + int n_img_pos_out = 0; + for (size_t i = 0; i < image_embd_v.size(); i++) { + std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); + n_img_pos_out += clip_n_patches(ctx_clip); + } + *n_img_pos = n_img_pos_out; + for (size_t i = 0; i < image_embd_v.size(); i++) { + free(image_embd_v[i]); + } + image_embd_v.clear(); + load_image_size->width = img->nx; + load_image_size->height = img->ny; + clip_add_load_image_size(ctx_clip, load_image_size); + LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); + } + else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 @@ -228,7 +292,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return false; } - } else { + } + else { // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; @@ -297,7 +362,11 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * } bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, 
int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model + int num_max_patches = 6; + if (clip_is_minicpmv(ctx_clip)) { + num_max_patches = 10; + } + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model if (!image_embd) { LOG_TEE("Unable to allocate memory for image embeddings\n"); return false; diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 19212f6e9e9c5..b6feb3027b2da 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -17,12 +17,11 @@ # define LLAVA_API #endif -struct clip_ctx; - #ifdef __cplusplus extern "C" { #endif +struct clip_ctx; struct llava_image_embed { float * embed; int n_image_pos; @@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); /** build an image embed from a path to an image filename */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** free an embedding made with llava_image_embed_make_* */ +LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp new file mode 100644 index 0000000000000..f951b57b29158 --- /dev/null +++ b/examples/llava/minicpmv-cli.cpp @@ -0,0 +1,309 @@ +#include "ggml.h" +#include "log.h" +#include "common.h" +#include "clip.h" +#include "llava.h" +#include "llama.h" + +#include +#include +#include + +struct llava_context { + struct clip_ctx * ctx_clip = NULL; + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + LOG_TEE("%s", text); +} + +static struct llama_model * llava_init(gpt_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); + + llama_model_params model_params = llama_model_params_from_gpt_params(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == NULL) { + LOG_TEE("%s: error: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + + llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + if (params->n_ctx < 2048) { + // warn user here, "Image processing requires at least 2048 context, setting context to 2048" + LOG_TEE("%s: warn: Image processing requires at least 2048 context, 
setting context to 2048\n" , __func__); + ctx_params.n_ctx = 2048; + } else { + ctx_params.n_ctx = params->n_ctx; + } + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + return NULL; + } + + auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + + ctx_llava->ctx_llama = ctx_llama; + ctx_llava->model = model; + return ctx_llava; +} + +static void llava_free(struct llava_context * ctx_llava) { + if (ctx_llava->ctx_clip) { + clip_free(ctx_llava->ctx_clip); + ctx_llava->ctx_clip = NULL; + } + + llama_free(ctx_llava->ctx_llama); + llama_free_model(ctx_llava->model); + llama_backend_free(); +} + +static struct clip_ctx * clip_init_context(gpt_params * params) { + const char * clip_path = params->mmproj.c_str(); + + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + return ctx_clip; +} + +static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { + int N = (int) tokens.size(); + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + LOG_TEE("%s : failed to eval. 
token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + return false; + } + *n_past += n_eval; + } + return true; +} + +static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past); +} + +static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ + std::string str2 = str; + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); +} + +static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) { + float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip)); + std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); + + auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + slice_embed->embed = image_embed; + slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); + llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); + llava_image_embed_free(slice_embed); +} + +static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) { + std::string system_prompt; + int idx = 0; + int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); + system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; + LOG_TEE("%s: image token past: %d\n", __func__, n_past); + eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if 
(num_image_embeds > 1) { + size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { + for (size_t j = 0; j < num_image_embeds_col; ++j) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (j == num_image_embeds_col - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } + } + } + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + } + LOG_TEE("%s: image token past: %d\n", __func__, n_past); +} + +static const char * sample(struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_llama, + int * n_past) { + const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); + llama_sampling_accept(ctx_sampling, ctx_llama, id, true); + static std::string ret; + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + ret = ""; + } else { + ret = llama_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past); + return ret.c_str(); +} + +static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ + auto ctx_clip = clip_init_context(params); + auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str()); + if (!embeds) { + std::cerr << "error: failed to load image " << fname << ". 
Terminating\n\n"; + return NULL; + } + + // process the prompt + if (params->prompt.empty() && params->interactive == false) { + LOG_TEE("prompt should be given or interactive mode should be on"); + return NULL; + } + + auto model = llava_init(params); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); + return NULL; + } + const int64_t t_llava_init_start_us = ggml_time_us(); + auto ctx_llava = llava_init_context(params, model); + ctx_llava->ctx_clip = ctx_clip; + const int64_t t_llava_init_end_us = ggml_time_us(); + float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; + LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); + + const int64_t t_process_image_start_us = ggml_time_us(); + process_image(ctx_llava, embeds, params, n_past); + const int64_t t_process_image_end_us = ggml_time_us(); + float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; + LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); + + llava_image_embed_free(embeds); + return ctx_llava; +} + +static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ + std::string user_prompt = prompt; + if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; + + eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); + eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); + // generate the response + + LOG_TEE("\n"); + + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); + return ctx_sampling; +} + +static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ + + const char * tmp = sample(ctx_sampling, 
ctx_llava->ctx_llama, &n_past); + return tmp; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + show_additional_info(argc, argv); + return 1; + } + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("llava", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); +#endif // LOG_DISABLE_LOGS + + if (params.mmproj.empty() || (params.image.empty())) { + gpt_params_print_usage(argc, argv, params); + show_additional_info(argc, argv); + return 1; + } + + for (auto & image : params.image) { + int n_past = 0; + auto ctx_llava = minicpmv_init(¶ms, image, n_past); + + if (!params.prompt.empty()) { + LOG_TEE("%s\n", params.prompt.c_str()); + LOG_TEE(""); + auto ctx_sampling = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true); + const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; + std::string response = ""; + bool have_tmp = false; + for (int i = 0; i < max_tgt_len; i++) { + auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); + response += tmp; + if (strcmp(tmp, "") == 0){ + if(!have_tmp)continue; + else break; + } + if (strstr(tmp, "###")) break; // Yi-VL behavior + have_tmp = true; + printf("%s", tmp); + if (strstr(response.c_str(), "")) break; // minicpm-v + + fflush(stdout); + } + llama_sampling_free(ctx_sampling); + }else { + while (true) { + LOG_TEE(""); + std::string prompt; + std::getline(std::cin, prompt); + LOG_TEE(""); + auto ctx_sampling = llama_init(ctx_llava, ¶ms, prompt, n_past, true); + const int max_tgt_len = params.n_predict < 0 ? 
256 : params.n_predict; + std::string response = ""; + for (int i = 0; i < max_tgt_len; i++) { + auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); + response += tmp; + if (strcmp(tmp, "") == 0) break; + if (strstr(tmp, "###")) break; // Yi-VL behavior + printf("%s", tmp);// mistral llava-1.6 + if (strstr(response.c_str(), "")) break; // minicpm-v + fflush(stdout); + } + llama_sampling_free(ctx_sampling); + } + } + printf("\n"); + llama_print_timings(ctx_llava->ctx_llama); + + ctx_llava->model = NULL; + llava_free(ctx_llava); + } + + return 0; +} diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py new file mode 100644 index 0000000000000..12cdd1281d2ff --- /dev/null +++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py @@ -0,0 +1,382 @@ +import argparse +import os +import json +import re + +import torch +import numpy as np +from gguf import * +from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig + +TEXT = "clip.text" +VISION = "clip.vision" + + +def add_key_str(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_minicpmv and name in ["visual_projection.weight"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", 
"t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. 
It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image-mean 0.48145466 0.4578275 0.40821073 --image-std 0.26862954 0.26130258 0.27577711 +# Example --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# parse the command line; --text-only and --vision-only are mutually exclusive +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --vision-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+ vocab = json.load(f) + tokens = [key for key in vocab] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +# if args.clip_model_is_vision or args.clip_model_is_openclip: +# model = CLIPVisionModel.from_pretrained(dir_model) +# processor = None +# else: +# model = CLIPModel.from_pretrained(dir_model) +# processor = CLIPProcessor.from_pretrained(dir_model) + +default_vision_config = { + "hidden_size": 1152, + "image_size": 980, + "intermediate_size": 4304, + "model_type": "idefics2", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } +vision_config = Idefics2VisionConfig(**default_vision_config) +model = Idefics2VisionTransformer(vision_config) + +processor = None +# if model.attn_pool is not None: +# model.attn_pool = torch.nn.Identity() + +# model.blocks = model.blocks[:-1] +model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip"))) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_minicpmv_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.minicpmv_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_minicpmv_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector) +fout.add_file_type(ftype) +if args.text_only: + 
fout.add_description("text-only CLIP model") +elif args.vision_only and not has_minicpmv_projector: + fout.add_description("vision-only CLIP model") +elif has_minicpmv_projector: + fout.add_description("image encoder for MiniCPM-V") + # add projector type + fout.add_string("clip.projector_type", "resampler") +else: + fout.add_description("two-tower CLIP model") + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", 448) + fout.add_uint32("clip.vision.patch_size", 14) + fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152) + fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16) + fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + block_count = 26 + fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = True +fout.add_bool("clip.use_gelu", use_gelu) + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + +def _replace_name_resampler(s, v): + if re.match("resampler.pos_embed", s): + return { + s: v, + re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), + } + if re.match("resampler.proj", s): + return { + re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), + re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), + } + if 
re.match("resampler.attn.in_proj_.*", s): + return { + re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0], + re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1], + re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2], + } + return {s: v} + +if has_minicpmv_projector: + projector = torch.load(args.minicpmv_projector) + new_state_dict = {} + for k, v in projector.items(): + kvs = _replace_name_resampler(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv + projector = new_state_dict + ftype_cur = 0 + for name, data in projector.items(): + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + if ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + fout.add_tensor(name, data) + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + + print("Projector tensors added\n") + +def _replace_name(s, v): + s = "vision_model." 
+ s + if re.match("vision_model.embeddings.position_embedding", s): + v = v.unsqueeze(0) + return {s: v} + + return {s: v} + +state_dict = model.state_dict() +new_state_dict = {} +for k, v in state_dict.items(): + kvs = _replace_name(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv +state_dict = new_state_dict +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. 
Output file: " + fname_out) diff --git a/examples/llava/minicpmv-surgery.py b/examples/llava/minicpmv-surgery.py new file mode 100644 index 0000000000000..2b6bce7cfebe9 --- /dev/null +++ b/examples/llava/minicpmv-surgery.py @@ -0,0 +1,47 @@ +import argparse +import os +import torch +from transformers import AutoModel, AutoTokenizer + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model", required=True) +args = ap.parse_args() + +# load the full model; its state dict includes the multimodal projector weights +model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True) +checkpoint = model.state_dict() + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +torch.save(projector, f"{args.model}/minicpmv.projector") + +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")] +if len(clip_tensors) > 0: + clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/minicpmv.clip") + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + +config = model.llm.config +config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5" +config.auto_map = { + "AutoConfig": "configuration_minicpm.MiniCPMConfig", + "AutoModel": "modeling_minicpm.MiniCPMModel", + "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification" +} +model.llm.save_pretrained(f"{args.model}/model") +tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+tok.save_pretrained(f"{args.model}/model") +# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py") + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.") diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt index fbfd0cdd722e1..dfe5fbe62cea6 100644 --- a/examples/llava/requirements.txt +++ b/examples/llava/requirements.txt @@ -2,3 +2,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu pillow~=10.2.0 torch~=2.2.1 +torchvision==0.17.1 diff --git a/examples/rpc/README.md b/examples/rpc/README.md index e1da801f285c6..adedc89090d37 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -1,5 +1,9 @@ ## Overview +> [!IMPORTANT] +> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and +> insecure. **Never run the RPC server on an open network or in a sensitive environment!** + The `rpc-server` allows running `ggml` backend on a remote host. The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them. 
This can be used for distributed LLM inference with `llama.cpp` in the following way: diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 7c15d2aa4acfb..6342e6488602a 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -16,7 +16,7 @@ #include struct rpc_server_params { - std::string host = "0.0.0.0"; + std::string host = "127.0.0.1"; int port = 50052; size_t backend_mem = 0; }; @@ -114,6 +114,17 @@ int main(int argc, char * argv[]) { fprintf(stderr, "Invalid parameters\n"); return 1; } + + if (params.host != "127.0.0.1") { + fprintf(stderr, "\n"); + fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str()); + fprintf(stderr, " Never expose the RPC server to an open network!\n"); + fprintf(stderr, " This is an experimental feature and is not secure!\n"); + fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + fprintf(stderr, "\n"); + } + ggml_backend_t backend = create_backend(); if (!backend) { fprintf(stderr, "Failed to create backend\n"); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 898c83ea3522b..360f571e42867 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -975,6 +975,8 @@ struct server_context { (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) || (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) { slot.prompt = *prompt; + } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) { + slot.prompt = prompt->at(0); } else { send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST); return false; diff --git a/flake.lock b/flake.lock index c54af88ea21c4..f9e1548a2aca5 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1722421184, - "narHash": 
"sha256-/DJBI6trCeVnasdjUo9pbnodCLZcFqnVZiLUfqLH4jA=", + "lastModified": 1723175592, + "narHash": "sha256-M0xJ3FbDUc4fRZ84dPGx5VvgFsOzds77KiBMW/mMTnI=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "9f918d616c5321ad374ae6cb5ea89c9e04bf3e58", + "rev": "5e0ca22929f3342b19569b21b2f3462f053e497b", "type": "github" }, "original": { diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index 6c3226c37e0ef..d483cf1ac40c6 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -50,6 +50,8 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); +GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); + GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 5c04bd7b7454f..b8c3d0e92d3c8 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -16,6 +16,8 @@ #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" +#elif defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data #endif #define UNUSED GGML_UNUSED diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index b512eb0be132e..aad189430ab0b 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -210,7 +210,7 @@ GGML_METAL_KERNEL_TYPE_COUNT }; -struct ggml_metal_context { +struct ggml_backend_metal_context { int n_cb; id device; @@ -224,6 +224,10 @@ bool support_simdgroup_mm; bool should_capture_next_compute; + + // abort ggml_metal_graph_compute if callback returns true + ggml_abort_callback abort_callback; + void * abort_callback_data; }; // MSL code @@ -289,7 +293,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ return data; } -static struct 
ggml_metal_context * ggml_metal_init(int n_cb) { +static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: allocating\n", __func__); #if TARGET_OS_OSX && !GGML_METAL_NDEBUG @@ -306,7 +310,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); // Configure context - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context)); ctx->device = device; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; @@ -668,7 +672,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ return ctx; } -static void ggml_metal_free(struct ggml_metal_context * ctx) { +static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) { @@ -734,7 +738,7 @@ static void ggml_metal_free(struct ggml_metal_context * ctx) { return nil; } -static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) { +static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) { for (size_t i = 0, n = 3; i < n; ++i) { if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { return false; @@ -845,7 +849,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const } static enum ggml_status ggml_metal_graph_compute( - struct ggml_metal_context * ctx, + struct ggml_backend_metal_context * ctx, struct ggml_cgraph * gf) { @autoreleasepool { @@ -878,8 +882,11 @@ static enum ggml_status ggml_metal_graph_compute( id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; command_buffer_builder[cb_idx] = command_buffer; - // enqueue 
the command buffers in order to specify their execution order - [command_buffer enqueue]; + // always enqueue the first two command buffers + // enqueue all of the command buffers if we don't need to abort + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer enqueue]; + } } const id *command_buffers = command_buffer_builder; @@ -2827,7 +2834,9 @@ static enum ggml_status ggml_metal_graph_compute( [encoder endEncoding]; - [command_buffer commit]; + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer commit]; + } }); // Wait for completion and check status of each command buffer @@ -2847,6 +2856,23 @@ static enum ggml_status ggml_metal_graph_compute( return GGML_STATUS_FAILED; } + + id next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil); + if (!next_buffer) { + continue; + } + + bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } + + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } + + [next_buffer commit]; } if (should_capture) { @@ -3150,7 +3176,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, } GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) { - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ggml_metal_free(ctx); free(backend); } @@ -3162,13 +3188,13 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe } GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; 
return ggml_metal_graph_compute(metal_ctx, cgraph); } GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context; return ggml_metal_supports_op(metal_ctx, op); } @@ -3213,9 +3239,9 @@ static ggml_guid_t ggml_backend_metal_guid(void) { } ggml_backend_t ggml_backend_metal_init(void) { - struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); - + struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); if (ctx == NULL) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return NULL; } @@ -3237,15 +3263,24 @@ bool ggml_backend_is_metal(ggml_backend_t backend) { void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); } +void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) { + GGML_ASSERT(ggml_backend_is_metal(backend)); + + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; + + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = user_data; +} + bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } @@ -3253,7 +3288,7 @@ bool 
ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { GGML_ASSERT(ggml_backend_is_metal(backend)); - struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context; ctx->should_capture_next_compute = true; } diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index b01ad267446fb..7757615f5a24b 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -197,6 +197,10 @@ static std::shared_ptr create_server_socket(const char * host, int por fprintf(stderr, "Failed to set SO_REUSEADDR\n"); return nullptr; } + if (inet_addr(host) == INADDR_NONE) { + fprintf(stderr, "Invalid host address: %s\n", host); + return nullptr; + } struct sockaddr_in serv_addr; serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr = inet_addr(host); @@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp if (result->buffer && buffers.find(result->buffer) == buffers.end()) { return nullptr; } + + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + result->op = (ggml_op) tensor->op; for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { result->op_params[i] = tensor->op_params[i]; @@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector & input) { const rpc_tensor * in_tensor = (const rpc_tensor *)input.data(); uint64_t offset; memcpy(&offset, input.data() + sizeof(rpc_tensor), 
sizeof(offset)); - size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset); + const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset); struct ggml_init_params params { /*.mem_size =*/ ggml_tensor_overhead(), @@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector & input) { return false; } GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); + + // sanitize tensor->data + { + const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer); + const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); + + if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { + GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + } + } + const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset); ggml_backend_tensor_set(tensor, data, offset, size); ggml_free(ctx); @@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector & input, std::vectorbuffer, tensor->data, offset, size); + + // sanitize tensor->data + { + const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer); + const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); + + if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { + GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + } + } + // output serialization format: | data (size bytes) | output.resize(size, 0); ggml_backend_tensor_get(tensor, output.data(), offset, size); diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index d7fea78d072b3..86732837254f0 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -268,6 +268,10 @@ struct vk_subbuffer { vk_buffer buffer; uint64_t offset; uint64_t size; + + operator vk::DescriptorBufferInfo() const { + return { buffer->buffer, offset, size }; + } }; struct vk_semaphore { @@ 
-1063,13 +1067,14 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { static void ggml_vk_sync_buffers(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_sync_buffers()"); - const std::vector mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } }; - ctx->s->buffer.pipelineBarrier( ctx->q->stage_flags, ctx->q->stage_flags, {}, - mem_barriers, + { { + {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}, + {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite} + } }, {}, {} ); @@ -2108,9 +2113,9 @@ void ggml_vk_instance_init() { } static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { - GGML_ASSERT(idx < vk_instance.device_indices.size()); VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")"); ggml_vk_instance_init(); + GGML_ASSERT(idx < vk_instance.device_indices.size()); ctx->name = GGML_VK_NAME + std::to_string(idx); @@ -2420,28 +2425,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo return s; } -static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, std::vector&& buffers, size_t push_constant_size, const void* push_constants, std::array elements) { + + +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; - for 
(auto& buffer : buffers) { - std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), "; + for (auto& buffer : descriptor_buffer_infos) { + std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), "; } std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); - std::vector descriptor_buffer_infos; - std::vector write_descriptor_sets; GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); - GGML_ASSERT(buffers.size() == pipeline->parameter_count); - vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; - for (uint32_t i = 0; i < pipeline->parameter_count; i++) { - descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size}); - } - for (uint32_t i = 0; i < pipeline->parameter_count; i++) { - write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); - } + GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count); - ctx->device->device.updateDescriptorSets(write_descriptor_sets, {}); + vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; + vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; + ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); @@ -3123,7 +3123,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - 
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -3312,7 +3312,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, + { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); } @@ -3384,7 +3384,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, 
qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3459,7 +3459,8 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3634,7 +3635,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * 
ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -3834,7 +3836,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, - { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } }, + { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, + vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); } @@ -4381,7 +4384,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (op == GGML_OP_ROPE) { // Empty src2 is possible in rope, but the shader needs a buffer vk_subbuffer subbuf_z; @@ -4392,20 +4395,20 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, 
subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (op == GGML_OP_IM2COL) { // im2col uses only src1 and dst buffers ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src2) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } } else { GGML_ASSERT(op != GGML_OP_SOFT_MAX); @@ -4442,10 +4445,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, 
vk_context& subctx, co if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c470eab838002..1146726759358 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -56,6 +56,9 @@ int ggml_sve_cnt_b = 0; // disable POSIX deprecation warnings // these functions are never going away, anyway #pragma warning(disable: 4996) + +// unreachable code because of multiple instances of code after GGML_ABORT +#pragma warning(disable: 4702) #endif #if defined(_WIN32) @@ -3740,7 +3743,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( struct ggml_tensor * view_src, size_t view_offs) { - assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); + GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT); + GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); // find the base tensor and absolute offset if (view_src != NULL && view_src->view_src != NULL) { diff --git a/gguf-py/examples/writer.py b/gguf-py/examples/writer.py index f39eed1afe763..731873a7d666c 100755 --- a/gguf-py/examples/writer.py +++ b/gguf-py/examples/writer.py @@ -15,7 +15,6 @@ def writer_example() -> None: # Example usage with a file gguf_writer = 
GGUFWriter("example.gguf", "llama") - gguf_writer.add_architecture() gguf_writer.add_block_count(12) gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 59ffd92ea00cc..f63ec450a4e09 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -217,6 +217,7 @@ class MODEL_ARCH(IntEnum): CHATGLM = auto() BITNET = auto() T5 = auto() + T5ENCODER = auto() JAIS = auto() @@ -344,6 +345,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", + MODEL_ARCH.T5ENCODER: "t5encoder", MODEL_ARCH.JAIS: "jais", } @@ -1036,6 +1038,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ENC_FFN_UP, MODEL_TENSOR.ENC_OUTPUT_NORM, ], + MODEL_ARCH.T5ENCODER: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ENC_ATTN_NORM, + MODEL_TENSOR.ENC_ATTN_Q, + MODEL_TENSOR.ENC_ATTN_K, + MODEL_TENSOR.ENC_ATTN_V, + MODEL_TENSOR.ENC_ATTN_OUT, + MODEL_TENSOR.ENC_ATTN_REL_B, + MODEL_TENSOR.ENC_FFN_NORM, + MODEL_TENSOR.ENC_FFN_GATE, + MODEL_TENSOR.ENC_FFN_DOWN, + MODEL_TENSOR.ENC_FFN_UP, + MODEL_TENSOR.ENC_OUTPUT_NORM, + ], MODEL_ARCH.JAIS: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1146,6 +1163,9 @@ class GGMLQuantizationType(IntEnum): F64 = 28 IQ1_M = 29 BF16 = 30 + Q4_0_4_4 = 31 + Q4_0_4_8 = 32 + Q4_0_8_8 = 33 # TODO: add GGMLFileType from ggml_ftype in ggml.h @@ -1158,7 +1178,7 @@ class LlamaFileType(IntEnum): MOSTLY_F16 = 1 # except 1d tensors MOSTLY_Q4_0 = 2 # except 1d tensors MOSTLY_Q4_1 = 3 # except 1d tensors - MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16 + # MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16 # MOSTLY_Q4_2 = 5 # support has been removed # MOSTLY_Q4_3 = 6 # support has been removed MOSTLY_Q8_0 = 7 # except 1d tensors @@ -1187,6 +1207,9 @@ class LlamaFileType(IntEnum): 
MOSTLY_IQ4_XS = 30 # except 1d tensors MOSTLY_IQ1_M = 31 # except 1d tensors MOSTLY_BF16 = 32 # except 1d tensors + MOSTLY_Q4_0_4_4 = 33 # except 1d tensors + MOSTLY_Q4_0_4_8 = 34 # except 1d tensors + MOSTLY_Q4_0_8_8 = 35 # except 1d tensors GUESSED = 1024 # not specified in the model file @@ -1260,6 +1283,9 @@ def get_type(val: Any) -> GGUFValueType: GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), GGMLQuantizationType.BF16: (1, 2), + GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16), + GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16), + GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16), } diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index ac98d9a92a3e9..8d4fece2dca86 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -191,6 +191,8 @@ def from_eager(cls, t: Any) -> Any: class LazyNumpyTensor(LazyBase): _tensor_type = np.ndarray + shape: tuple[int, ...] # Makes the type checker happy in quants.py + @classmethod def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]: # The initial idea was to use np.nan as the fill value, diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index f4361d7517076..ff589b85245e5 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -1,40 +1,32 @@ from __future__ import annotations -from typing import Callable, Sequence +from abc import ABC, abstractmethod +from typing import Any, Callable, Sequence +from math import log2, ceil from numpy.typing import DTypeLike -from .constants import GGML_QUANT_SIZES, GGMLQuantizationType +from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K from .lazy import LazyNumpyTensor import numpy as np -def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): +def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % 
block_size != 0: raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})") return (*shape[:-1], shape[-1] // block_size * type_size) -def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): +def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % type_size != 0: raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})") return (*shape[:-1], shape[-1] // type_size * block_size) -# same as ggml_compute_fp32_to_bf16 in ggml-impl.h -def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray: - n = n.astype(np.float32, copy=False).view(np.uint32) - # force nan to quiet - n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n) - # round to nearest even - n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16 - return n.astype(np.uint16) - - # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time -def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray: +def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray: rows = arr.reshape((-1, arr.shape[-1])) osize = 1 for dim in oshape: @@ -46,76 +38,1151 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np. 
return out.reshape(oshape) -def __quantize_bf16_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape) +# round away from zero +# ref: https://stackoverflow.com/a/59143326/22827863 +def np_roundf(n: np.ndarray) -> np.ndarray: + a = abs(n) + floored = np.floor(a) + b = floored + np.floor(2 * (a - floored)) + return np.sign(n) * b + + +class QuantError(Exception): ... -__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16) +_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {} -def quantize_bf16(n: np.ndarray): - if type(n) is LazyNumpyTensor: - return __quantize_bf16_lazy(n) +def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + if qtype == GGMLQuantizationType.F32: + return data.astype(np.float32, copy=False) + elif qtype == GGMLQuantizationType.F16: + return data.astype(np.float16, copy=False) + elif (q := _type_traits.get(qtype)) is not None: + return q.quantize(data) else: - return __quantize_bf16_array(n) + raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented") -__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0] +def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + if qtype == GGMLQuantizationType.F32: + return data.view(np.float32) + elif qtype == GGMLQuantizationType.F16: + return data.view(np.float16).astype(np.float32) + elif (q := _type_traits.get(qtype)) is not None: + return q.dequantize(data) + else: + raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented") -def can_quantize_to_q8_0(n: np.ndarray) -> bool: - return n.shape[-1] % __q8_block_size == 0 +class __Quant(ABC): + qtype: GGMLQuantizationType + block_size: int + type_size: int + grid: np.ndarray[Any, np.dtype[np.float32]] | None = None + grid_shape: tuple[int, int] = (0, 0) + grid_map: tuple[int | float, ...] 
= () + grid_hex: bytes | None = None -# round away from zero -# ref: https://stackoverflow.com/a/59143326/22827863 -def np_roundf(n: np.ndarray) -> np.ndarray: - a = abs(n) - floored = np.floor(a) - b = floored + np.floor(2 * (a - floored)) - return np.sign(n) * b + def __init__(self): + return TypeError("Quant conversion classes can't have instances") + def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None: + cls.qtype = qtype + cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype] + cls.__quantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__quantize_array, + meta_noop=(np.uint8, cls.__shape_to_bytes) + ) + cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__dequantize_array, + meta_noop=(np.float32, cls.__shape_from_bytes) + ) + assert qtype not in _type_traits + _type_traits[qtype] = cls -def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]: - return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size) + @classmethod + def init_grid(cls): + if cls.grid is not None or cls.grid_hex is None: + return + bits_per_elem = ceil(log2(len(cls.grid_map))) + assert bits_per_elem != 0, cls.qtype.name + elems_per_byte = 8 // bits_per_elem -# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c -def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray: - shape = n.shape - assert shape[-1] % __q8_block_size == 0 + grid = np.frombuffer(cls.grid_hex, dtype=np.uint8) + # decode hexadecimal chars from grid + grid = grid.reshape((-1, 2)) + grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array([4, 0], dtype=np.uint8).reshape((1, 2)) + grid = grid[..., 0] | grid[..., 1] + # unpack the grid values + grid = grid.reshape((-1, 1)) >> np.array([i for i in range(0, 8, 8 // elems_per_byte)], dtype=np.uint8).reshape((1, elems_per_byte)) + grid = (grid & ((1 << bits_per_elem) - 1)).reshape((-1, 1)) + grid_map = np.array(cls.grid_map, dtype=np.float32).reshape((1, -1)) + grid = np.take_along_axis(grid_map, 
grid, axis=-1) + cls.grid = grid.reshape((1, 1, *cls.grid_shape)) - n_blocks = n.size // __q8_block_size + @classmethod + @abstractmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + raise NotImplementedError - blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False) + @classmethod + @abstractmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + raise NotImplementedError - d = abs(blocks).max(axis=1, keepdims=True) / 127 - with np.errstate(divide="ignore"): - id = np.where(d == 0, 0, 1 / d) - qs = np_roundf(blocks * id) + @classmethod + def quantize_rows(cls, rows: np.ndarray) -> np.ndarray: + rows = rows.astype(np.float32, copy=False) + shape = rows.shape + n_blocks = rows.size // cls.block_size + blocks = rows.reshape((n_blocks, cls.block_size)) + blocks = cls.quantize_blocks(blocks) + assert blocks.dtype == np.uint8 + assert blocks.shape[-1] == cls.type_size + return blocks.reshape(cls.__shape_to_bytes(shape)) - # (n_blocks, 2) - d = d.astype(np.float16).view(np.uint8) - # (n_blocks, block_size) - qs = qs.astype(np.int8).view(np.uint8) + @classmethod + def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray: + rows = rows.view(np.uint8) + shape = rows.shape + n_blocks = rows.size // cls.type_size + blocks = rows.reshape((n_blocks, cls.type_size)) + blocks = cls.dequantize_blocks(blocks) + assert blocks.dtype == np.float32 + assert blocks.shape[-1] == cls.block_size + return blocks.reshape(cls.__shape_from_bytes(shape)) - assert d.shape[1] + qs.shape[1] == __q8_type_size + @classmethod + def __shape_to_bytes(cls, shape: Sequence[int]): + return quant_shape_to_byte_shape(shape, cls.qtype) - return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape)) + @classmethod + def __shape_from_bytes(cls, shape: Sequence[int]): + return quant_shape_from_byte_shape(shape, cls.qtype) + @classmethod + def __quantize_array(cls, array: np.ndarray) -> np.ndarray: + return 
_apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape)) -def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape)) + @classmethod + def __dequantize_array(cls, array: np.ndarray) -> np.ndarray: + cls.init_grid() + return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape)) + @classmethod + def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any: + pass -__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn( - __quantize_q8_0_array, - meta_noop=(np.uint8, __quantize_q8_0_shape_change), -) + @classmethod + def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any: + pass + @classmethod + def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool: + return tensor.shape[-1] % cls.block_size == 0 -def quantize_q8_0(data: np.ndarray): - if type(data) is LazyNumpyTensor: - return __quantize_q8_0_lazy(data) - else: - return __quantize_q8_0_array(data) + @classmethod + def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray: + if not cls.can_quantize(tensor): + raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}") + if isinstance(tensor, LazyNumpyTensor): + return cls.__quantize_lazy(tensor) + else: + return cls.__quantize_array(tensor) + + @classmethod + def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray: + if isinstance(tensor, LazyNumpyTensor): + return cls.__dequantize_lazy(tensor) + else: + return cls.__dequantize_array(tensor) + + +class BF16(__Quant, qtype=GGMLQuantizationType.BF16): + @classmethod + # same as ggml_compute_fp32_to_bf16 in ggml-impl.h + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n = blocks.view(np.uint32) + # force nan to quiet + n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) 
| np.uint32(64 << 16), n) + # round to nearest even + n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16 + return n.astype(np.uint16).view(np.uint8) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32) + + +class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + imax = abs(blocks).argmax(axis=-1, keepdims=True) + max = np.take_along_axis(blocks, imax, axis=-1) + + d = max / -8 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + # FIXME: Q4_0's reference rounding is cursed and depends on FMA + qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15) + + qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) + qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([d, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.int8) - np.int8(8) + + return (d * qs.astype(np.float32)) + + +class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + max = blocks.max(axis=-1, keepdims=True) + min = blocks.min(axis=-1, keepdims=True) + + d = (max - min) / 15 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 15) + + qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) + qs = qs[..., 
0, :] | (qs[..., 1, :] << np.uint8(4)) + + d = d.astype(np.float16).view(np.uint8) + m = min.astype(np.float16).view(np.uint8) + + return np.concatenate([d, m, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + m, qs = np.hsplit(rest, [2]) + + d = d.view(np.float16).astype(np.float32) + m = m.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.float32) + + return (d * qs) + m + + +class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + imax = abs(blocks).argmax(axis=-1, keepdims=True) + max = np.take_along_axis(blocks, imax, axis=-1) + + d = max / -16 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + # FIXME: Q5_0's reference rounding is cursed and depends on FMA + q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31) + + qs = q.reshape((n_blocks, 2, cls.block_size // 2)) + qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) + + qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([d, qh, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qh, qs = np.hsplit(rest, [4]) + + d = d.view(np.float16).astype(np.float32) + qh = qh.view(np.uint32) + + qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 
2, 1)) + qh = (qh & np.uint32(0x01)).astype(np.uint8) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) + + qs = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(16) + + return (d * qs.astype(np.float32)) + + +class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + max = blocks.max(axis=-1, keepdims=True) + min = blocks.min(axis=-1, keepdims=True) + + d = (max - min) / 31 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + q = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 31) + + qs = q.reshape((n_blocks, 2, cls.block_size // 2)) + qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) + + qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4) + + d = d.astype(np.float16).view(np.uint8) + m = min.astype(np.float16).view(np.uint8) + + return np.concatenate([d, m, qh, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + m, rest = np.hsplit(rest, [2]) + qh, qs = np.hsplit(rest, [4]) + + d = d.view(np.float16).astype(np.float32) + m = m.view(np.float16).astype(np.float32) + qh = qh.view(np.uint32) + + qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qh = (qh & np.uint32(0x01)).astype(np.uint8) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) + + qs = (ql | (qh << np.uint8(4))).astype(np.float32) + + return (d * qs) + m + + +class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): + @classmethod + # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c + def quantize_blocks(cls, blocks: np.ndarray) -> 
np.ndarray: + + d = abs(blocks).max(axis=1, keepdims=True) / 127 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + + # (n_blocks, 2) + d = d.astype(np.float16).view(np.uint8) + # (n_blocks, block_size) + qs = qs.astype(np.int8).view(np.uint8) + + return np.concatenate([d, qs], axis=1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + d, x = np.split(blocks, [2], axis=1) + d = d.view(np.float16).astype(np.float32) + x = x.view(np.int8).astype(np.float32) + + return (x * d) + + +class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + scales, rest = np.hsplit(blocks, [QK_K // 16]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + d, dmin = np.hsplit(rest, [2]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + # (n_blocks, 16, 1) + dl = (d * (scales & 0xF).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1)) + ml = (dmin * (scales >> 4).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1)) + + shift = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + + qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & np.uint8(3) + + qs = qs.reshape((n_blocks, QK_K // 16, 16)).astype(np.float32) + + qs = dl * qs - ml + + return qs.reshape((n_blocks, -1)) + + +class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + hmask, rest = np.hsplit(blocks, [QK_K // 8]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + scales, d = np.hsplit(rest, [12]) + + d = d.view(np.float16).astype(np.float32) + + # The scales are packed at 6-bit each in this pattern: + # 0: IIIIAAAA + # 1: JJJJBBBB + # 2: KKKKCCCC + # 3: LLLLDDDD + # 4: MMMMEEEE + # 5: NNNNFFFF + # 6: OOOOGGGG + # 7: PPPPHHHH + # 8: MMIIEEAA + # 9: NNJJFFBB + # 10: OOKKGGCC + # 11: 
PPLLHHDD + lscales, hscales = np.hsplit(scales, [8]) + lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1)) + lscales = lscales.reshape((n_blocks, 16)) + hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 4, 1)) + hscales = hscales.reshape((n_blocks, 16)) + scales = (lscales & np.uint8(0x0F)) | ((hscales & np.uint8(0x03)) << np.uint8(4)) + scales = (scales.astype(np.int8) - np.int8(32)).astype(np.float32) + + dl = (d * scales).reshape((n_blocks, 16, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1)) + ql = ql.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(3) + qh = (qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1)) + qh = qh ^ np.uint8(1) # strangely, the offset is zero when the bitmask is 1 + q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype(np.float32) + + return (dl * q).reshape((n_blocks, QK_K)) + + +class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): + K_SCALE_SIZE = 12 + + @staticmethod + def get_scale_min(scales: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + n_blocks = scales.shape[0] + scales = scales.view(np.uint8) + ### Unpacking the following: ### + # 0 EEAAAAAA + # 1 FFBBBBBB + # 2 GGCCCCCC + # 3 HHDDDDDD + # 4 eeaaaaaa + # 5 ffbbbbbb + # 6 ggcccccc + # 7 hhdddddd + # 8 eeeeEEEE + # 9 ffffFFFF + # 10 ggggGGGG + # 11 hhhhHHHH + scales = scales.reshape((n_blocks, 3, 4)) + d, m, m_d = np.split(scales, 3, axis=-2) + + sc = np.concatenate([d & 0x3F, (m_d & 0x0F) | ((d >> 2) & 0x30)], axis=-1) + min = np.concatenate([m & 0x3F, (m_d >> 4) | ((m >> 2) & 0x30)], axis=-1) + + return (sc.reshape((n_blocks, 8)), min.reshape((n_blocks, 8))) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = 
np.hsplit(blocks, [2]) + dmin, rest = np.hsplit(rest, [2]) + scales, qs = np.hsplit(rest, [cls.K_SCALE_SIZE]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + sc, m = Q4_K.get_scale_min(scales) + + d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) + dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) + + qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 32)).astype(np.float32) + + return (d * qs - dm).reshape((n_blocks, QK_K)) + + +class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + dmin, rest = np.hsplit(rest, [2]) + scales, rest = np.hsplit(rest, [Q4_K.K_SCALE_SIZE]) + qh, qs = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + sc, m = Q4_K.get_scale_min(scales) + + d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) + dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1)) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) + qh = (qh & np.uint8(0x01)).reshape((n_blocks, -1, 32)) + q = (ql | (qh << np.uint8(4))).astype(np.float32) + + return (d * q - dm).reshape((n_blocks, QK_K)) + + +class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + ql, rest = np.hsplit(blocks, [QK_K // 2]) + qh, rest = np.hsplit(rest, [QK_K // 4]) + scales, d = np.hsplit(rest, [QK_K // 16]) + + scales = scales.view(np.int8).astype(np.float32) + d = 
d.view(np.float16).astype(np.float32) + d = (d * scales).reshape((n_blocks, QK_K // 16, 1)) + + ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + qh = (qh & np.uint8(0x03)).reshape((n_blocks, -1, 32)) + q = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32) + q = q.reshape((n_blocks, QK_K // 16, -1)).astype(np.float32) + + return (d * q).reshape((n_blocks, QK_K)) + + +class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS): + ksigns: bytes = ( + b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f" + b"\x90\x11\x12\x93\x14\x95\x96\x17\x18\x99\x9a\x1b\x9c\x1d\x1e\x9f" + b"\xa0\x21\x22\xa3\x24\xa5\xa6\x27\x28\xa9\xaa\x2b\xac\x2d\x2e\xaf" + b"\x30\xb1\xb2\x33\xb4\x35\x36\xb7\xb8\x39\x3a\xbb\x3c\xbd\xbe\x3f" + b"\xc0\x41\x42\xc3\x44\xc5\xc6\x47\x48\xc9\xca\x4b\xcc\x4d\x4e\xcf" + b"\x50\xd1\xd2\x53\xd4\x55\x56\xd7\xd8\x59\x5a\xdb\x5c\xdd\xde\x5f" + b"\x60\xe1\xe2\x63\xe4\x65\x66\xe7\xe8\x69\x6a\xeb\x6c\xed\xee\x6f" + b"\xf0\x71\x72\xf3\x74\xf5\xf6\x77\x78\xf9\xfa\x7b\xfc\x7d\x7e\xff" + ) + + # iq2xxs_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. 
+ grid_shape = (256, 8) + grid_map = (0x08, 0x19, 0x2b) + grid_hex = ( + b"00000200050008000a00110014002000220028002a0041004400500058006100" + b"6400800082008a00a20001010401100115014001840198010002020222028202" + b"010404041004210424044004420448046004810484049004a404000502050805" + b"200546056905800591050906100640068406a406000805080808140828084108" + b"440850085208880804094009020a140a01100410101021104010601084109010" + b"951000110811201150115a118011241245120014081420142514491480141815" + b"6215001616160118041810184018811800190519a019511a002002200a204420" + b"6120802082202921482100220222012404241024402456240025412564259026" + b"082820289428442a014004401040184021402440404048405640604081408440" + b"9040004120416141804185410142104248425642684200440844204480449944" + b"124524450046014804481048404845480049584961498249454a904a00500850" + b"1150195020508050885004514251a4519152905492540a550156545600581158" + b"195864584059085a046010604060686000615561186260620064056410651265" + b"84654268008002800a8041808280048118814081118201840484108415844084" + b"608400854685948509864086608602880489118a0490109024904090a1901691" + b"8091459200942294449451958198209902a050a085a009a100a218a450a804a9" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.view(np.uint32).reshape(n_blocks, -1, 2) + + db = d * (np.float32(0.5) + (qs[..., 1] >> 28).astype(np.float32)) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = qs[..., 1].reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4)) + ksigns = np.frombuffer(cls.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) + signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) + signs = np.take_along_axis(ksigns, signs, axis=-1) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in 
range(8)], dtype=np.uint8).reshape((1, 1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs[..., 0].copy().view(np.uint8).reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ2_XS(__Quant, qtype=GGMLQuantizationType.IQ2_XS): + # iq2xs_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (512, 8) + grid_map = (0x08, 0x19, 0x2b) + grid_hex = ( + b"00000200050008000a0011001400160019002000220025002800410044004600" + b"49005000520055005800610064008000820085008800910094009900a0000101" + b"04010601090110011201150118011a0121012401400142014501480151015401" + b"6001680181018401900100020202050208021102140220024102440250025502" + b"80028a0201040404060409041004120415041804210424044004420445044804" + b"5104540456046004810484049004000502050505080511051405200541054405" + b"500561058005010604061006260640064206840600080208050808080a081108" + b"14082008250841084408500858088008a008aa08010904091009400981098909" + b"000a200a280a960aa00a01100410061009101010121015101810211024104010" + b"4210451048105110541060106a10811084109010001102110511081111111411" + b"2011411144115011801194119611011204120612101240126012001402140514" + b"0814111414142014411444144914501464148014011504151015401500161416" + b"49160118041810181218401854188618001905196619511aa91a002002200520" + b"08200a201120142020204120442050208020a020012104211021402148216521" + b"002222228022a82201240424102429244024002541255225992501261a26a626" + b"002808280a28202855288828a22868299029082a202a822a882a8a2a01400440" + b"0640094010401240154018402140244040404240454048404a40514054406040" + b"6540814084409040004102410541084111411441204141414441504180418541" + 
b"a241014204421042124229424042004402440544084411441444194420444144" + b"4444504480449444014504451045244540459a4500460a464446504601480448" + b"1048404845485448624800491149444950496949044a00500250055008501150" + b"145020502850415044505050805001510451105115514051425100524452aa52" + b"0154045410542154405460548154a154005508558055885521566856a1560058" + b"14584158505899581a5940594259855a0160046010604060546062608660a960" + b"006124624a62926200641664106540654565a46501686a682569066a546a626a" + b"00800280058008801180148020802a8041804480508080808280a880aa800181" + b"0481068110814081518159810082208280828282a082a8820184048410841284" + b"158440846084898400854485a58518866a860088088825885a8880888288a888" + b"0689228a808a888a968aa88a0190049010904090569084900091229164915692" + b"89920094059444945094589429959095929541965198a6984999159a609a00a0" + b"02a008a00aa020a02aa0a0a051a159a1a6a100a202a208a22aa280a2a0a240a4" + b"95a465a698a60aa820a822a828a8a0a8a8a804a984a986a928aa2aaa91aaaaaa" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, scales = np.hsplit(rest, [2 * QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + qs = qs.view(np.uint16) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (np.float32(0.5) + scales) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape(1, 1, 128) + signs = np.take_along_axis(signs, (qs >> 9).reshape((n_blocks, -1, 1)), axis=-1) + signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 2, 8)) + + assert cls.grid is not None + grid = 
np.take_along_axis(cls.grid, (qs & np.uint16(511)).reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ2_S(__Quant, qtype=GGMLQuantizationType.IQ2_S): + # iq2s_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (1024, 8) + grid_map = (0x08, 0x19, 0x2b) + grid_hex = ( + b"00000200050008000a0011001400160019002000220025002800410044004600" + b"490050005200550058006100640066006900800082008500880091009400a000" + b"a500aa0001010401060109011001120115011801210124014001420145014801" + b"510154015601590160016501680181018401900192019501a101a40100020202" + b"050208021102140220022a02410244024602490250025502800285028a029402" + b"a202010404040604090410041204150418042104240426042904400442044504" + b"48044a0451045404560459046004620465048104840486048904900495049804" + b"a104a40400050205050508050a05110514051605190520052505280541054405" + b"46054905500552055505580561056405800582058505880591059405a0050106" + b"0406060609061006150640064506480651065406600681068406900600080208" + b"050808081108140816081908200825082a084108440846084908500852085508" + b"580861086408800885089408aa08010904091009120915091809210940094509" + b"480951095409600981099009000a110a140a220a280a2a0a500a990a01100410" + b"0610091010101210151018102110241026104010421045104810511054105610" + b"59106010621065106810811084108610901095109810a110a410001102110511" + b"08110a1111111411161119112011221125112811411144114611491150115211" + b"5511581161116411801182118511881191119411011204120912101215122112" + b"2412401245125112541281128412901200140214051408141114141416141914" + b"2014251428144114441446144914501452145514581461146414801482148514" + b"881491149414a014011504150615091510151215151518152115241540154215" + b"4515481551155415601581158415901500160516081611161416201641164416" + b"50168016aa160118041806180918101815181818211840184218451848185118" + 
b"541860188118841800190219051908191119141920194119441950196919a219" + b"041a101a401a561a00200220052008201120142016201920202025202a204120" + b"4420502052205520642080208a209420aa200121042110211221152121214021" + b"4221452151215421602181218421902100220a22222228222a22442250228822" + b"8a22a82201240424062409241024152418242124242440244224452448245124" + b"5424602481248424902400250525082511251425202541254425502566258025" + b"0126042610264026592600280528112814284128442850288a28aa2801290429" + b"102995290a2a222a642a882a8a2a014004400640094010401240154018401a40" + b"21402440264040404240454048404a4051405440564059406040624065408140" + b"8440904095409840a140a4400041024105410841114114411641194120412241" + b"2541414144414641494150415241554158416141644180418241854188419141" + b"9441a04101420442104212421542184224424042454248425142544260428142" + b"844200440244054408440a441144144416441944204422442544284441444444" + b"46444944504452445544584461446444804482448544884491449444a0440145" + b"0445064509451045124515451845214524454045424545454845514554456045" + b"6a4581458445904500460246054608461146144620464146444650468046a546" + b"0148044809481048124815481848214824484048424845484848514854486048" + b"84489048004902490549084911491449204941494449504980499649014a044a" + b"104a404a00500250055008501150145016501950205022502550285041504450" + b"4650495050505250555058506150645080508250855088509150945001510451" + b"0651095110511251155118512151245140514251455148515151545160518151" + b"8451905100520552085211521452205241524452505269528052015404540654" + b"0954105412541554185421542454405442544554485451545454605481548454" + b"9054005502550555085511551455205541554455505580550156045610562656" + b"405600580258055808581158145820584158445850585a588058015904591059" + b"4059005a195a855aa85a01600460066010601260156018602160246040604560" + b"4860516054606060846090600061026105610861116114612061416144615061" + b"806199610462106240625662a162006405640864116414642064416444645064" + 
b"806401650465106540654a656865926500669466016804681068656898680069" + b"2a69426aa16a0080028005800880118014801980208025804180448050805280" + b"5580588061808080858091809480018104810981108112811581188121812481" + b"408142814581488151815481818184819081a981008205820a82118214824182" + b"4482508201840484068409841084128415841884218440844284458448845184" + b"5484608481848484908400850285058508851185148520854185448550858085" + b"8a85018604861086298640860088058811881488418844885088a28801890489" + b"40896589228a588a5a8a828aa28a019004900990109012901590189024904090" + b"4290459048905190549060908190849090900091059111911491419144915091" + b"5a910192049210924092a6920094029405940894119414942094419444945094" + b"8094969401950495109540959895a19500964696649601980498109826984098" + b"a998009949995299909a00a005a00aa014a022a02aa041a044a050a0a2a0aaa0" + b"40a165a102a20aa222a228a22aa282a288a28aa2a8a201a404a410a440a489a4" + b"a4a400a519a551a60aa828a8a2a854a986a908aa0aaa20aa22aa28aa88aaaaaa" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, rest = np.hsplit(rest, [QK_K // 8]) + signs, rest = np.hsplit(rest, [QK_K // 8]) + qh, scales = np.hsplit(rest, [QK_K // 32]) + + d = d.view(np.float16).astype(np.float32) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (np.float32(0.5) + scales) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # unpack the sign bits + signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 2, 8)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4)) + qs = qs.astype(np.uint16) | ((qh & 0x03).astype(np.uint16) 
<< 8).reshape((n_blocks, -1)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ3_XXS(__Quant, qtype=GGMLQuantizationType.IQ3_XXS): + grid_shape = (256, 4) + grid_map = (0x04, 0x0c, 0x14, 0x1c, 0x24, 0x2c, 0x34, 0x3e) + grid_hex = ( + b"0000020004001100130017002000220031004200730075000101030110011201" + b"2101250130013201410154017001000202020402110220022202310233023702" + b"5102570275020103070310031203250370031304370444045704730475040105" + b"0705320552053506640610071407160743076107011003101010121021102310" + b"3010321034104710501000110211111120112211011203121012121221123012" + b"7212001302132013311346136613011405145014201524154615711505162217" + b"4017002002201120132020202220262031204220012103210521102112212121" + b"3021632167217021002202221122172220222222372240225522012310231423" + b"7023742335245324032527254125742501270327162745270130103012302130" + b"2330503065307230003102312031313144314631013203321032253252327232" + b"1133333330344734723400350635223555351436363663363337603704401740" + b"3540374053405740744120423742404260426642074345430444514464442545" + b"4345704505471047124730471250415070500051065126515551145232527252" + b"0253535310542354275472540255315550562457425724604460466064602161" + b"6161176264623063366344640565526533660367216703700570077010703270" + b"5270267140711272457252720073157333736073217441740075027524753076" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, scales = np.hsplit(rest, [QK_K // 4]) + + d = d.view(np.float16).astype(np.float32) + scales = scales.view(np.uint32) + + db = d * (np.float32(0.5) + (scales >> 28).astype(np.float32)) * np.float32(0.5) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = 
scales.reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4)) + ksigns = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) + signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) + signs = np.take_along_axis(ksigns, signs, axis=-1) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ3_S(__Quant, qtype=GGMLQuantizationType.IQ3_S): + grid_shape = (512, 4) + grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f) + grid_hex = ( + b"0000010002000500070010001100120014001600200021002500330040004200" + b"4500470051005300600062007100740077000001010102010401100111011501" + b"2001230127013101350144016101650172010002010205020702100213021602" + b"2102250230023402420245024702510253027002730203031103150320032203" + b"3103330336034403500352036703710375030004130417042104240432044004" + b"4304510470040205040520052205260533054105450547056605730506061106" + b"1306310652067106000702070407200722072607330750075407001001100210" + b"0410101011101310151017102010221031103410361054105610611072100011" + b"0111031106111011141121113011331141115011521170117611001212121512" + b"1712201224123212401243125512601272120113041307131013131321132713" + b"3013341341136213701303140514121414143114331442144614501454140115" + b"1015131521153015321551152016241627164416461601170317101712172117" + b"3517411762177017002001200320052007201020122014201620212023202720" + b"3020322041204320452050205220672070207320752000210221102113211721" + b"2221252131213421422151210122042207222122232230223722412253225722" + 
b"7122742200230223052311232223242331233323422350236623012407242024" + b"2324322435244124722475240425112522253725402553257025002602260726" + b"2126552661260527112726273027432750270230113013301530173022303130" + b"3330353042304430473051306330713001310331053114312131233140316031" + b"7231763100321232203232323432503201331033143321332333273330334133" + b"4333473355337333033411341634223431345234603464340135103512352535" + b"3235443556357335163641360137033720372237353700400440124020402440" + b"2740324041405040704002410741114113412241304135414341514155410142" + b"0342104215422142334240425742624270420443114313432043224331433543" + b"0044024424443744404471440545074521456245134634466046104715473047" + b"4347514702501050145022504050445047505250665074500151035105511251" + b"2151325172510052115223523052365253520253075310532753445351536553" + b"7353015404542054325446541255265551555355425602570457225711601360" + b"1560316033606060006120612761646112623462426255626262706200631463" + b"2163406325644364626400650365346560650566406611671367007004700770" + b"2070227036704070547062700271117124714371457101720472107216722172" + b"3072517202733273357353730174057413742074507422754275027631760077" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + qh, rest = np.hsplit(rest, [QK_K // 32]) + signs, scales = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (1 + 2 * scales) + db = db.reshape((n_blocks, -1, 1, 1)) + + # unpack the sign bits + signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = 
signs.reshape((n_blocks, -1, 4, 8)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8) + qh = (qh & 0x01).astype(np.uint16).reshape((n_blocks, -1)) + qs = qs.astype(np.uint16) | (qh << 8) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ1_S(__Quant, qtype=GGMLQuantizationType.IQ1_S): + # iq1s_grid, with each byte packed into 2 bits + # -1, 0, 1 <=> 0, 1, 2 + grid_shape = (2048, 8) + grid_map = (-1, 0, 1) + grid_hex = ( + b"00000200050008000a00110015002000220028002a0045005100540056006500" + b"8000820088008a009500a000a200a800aa000401050111011401160119011a01" + b"2501410146014901520155015a0161016401660168018501910194019601a501" + b"0002020208020a0215022002220228022a024502510259026402690280028202" + b"88028a02910295029902a002a202a802aa021104140416042504410449045504" + b"5a046404650491049904a5040105040505050605150518051a05290540054505" + b"4a0550055105540555055605590560056205650568056a058105910595059805" + b"9a05a105a405a505a605a9051406190641064406500652065506580660066106" + b"6606690685069106940699060008020808080a0815082008220828082a084508" + b"5108560865088008820888088a089508a008a208a808aa080509110914091909" + b"2409250941095009510955096109640969099109940996099909a509000a020a" + b"080a0a0a150a200a220a280a2a0a450a510a590a610a650a800a820a850a880a" + b"8a0a950aa00aa20aa80aaa0a1010111014101910241025104110441050105510" + b"58106110641065106910911094109610a110a510011104110611091110111211" + b"1511181121112411291145114a11501151115211541155115611591160116511" + b"841192119511a111a41111121412161225124012461249125212551258125a12" + b"641266128512911294129612a512011406140914141415141814191421142614" + b"41144514461448144a1451145414551456145914621465146814841489149014" + b"94149514981499149a14a114a414a514a914021505150a151115141515151615" + 
b"191520152215251528152a154115441545154615511552155415551556155915" + b"5a1561156415651566156915801582158415851588158a159015911594159515" + b"961599159a15a015a215a51501160416051606161516161618161a1621162616" + b"401642164416451648164a165116551656165816591661166416651668166916" + b"6a1686168a1692169516a416a916111816182518411844184618491850185518" + b"58185a1860186118641866186918851891189418a5181019121915191a192119" + b"25194219441945194819511954195519561959195a19601965196a1989199119" + b"921995199819a119a619a919091a161a241a261a441a461a491a501a521a551a" + b"581a611a661a691a851a911a961a9a1a0020022008200a201520202022202520" + b"28202a20452051205920612065208020822088208a209520a020a220a520a820" + b"aa2005211121142119212521422144214921552158215a216121642165216621" + b"8521902196219921a521012208220a22112215222022222228222a2245225122" + b"562259226522812288228a2291229522a022a222a822aa220524142416241924" + b"252444244524462449245224552458245a2466248524912494249924a124a524" + b"0925152521252925402545254825512554255525592562256525682589259025" + b"9425952598259a25a125a425a625a92505261026122619262526412649265526" + b"6026612669268426862690269a260028022808280a2815282028222828282a28" + b"45285128542865288028822888288a28a028a228a828aa280929112914291929" + b"2529462949295229552961296429662969298529902996299929a429a529002a" + b"022a082a0a2a202a222a282a2a2a452a512a562a592a652a802a822a882a8a2a" + b"952aa02aa22aa82aaa2a054011401640254049405240554058405a4061406440" + b"664094409940a140a6400041014104410641094112411541164118411a412141" + b"26412941454148414a41514154415541564159415a41654168416a4181418441" + b"8641904192419541a041a141a241054211421442164225424142524255425a42" + b"6442694289429442a5420144154419442944454448444a445144544455445644" + b"61446244654468446a44814486448944904492449544a044a144a94401450245" + b"05450a4511451445154516451945204525452a45414544454545464549455045" + b"5145544555455645584559456145644565456645694582458445854588459145" + 
b"94459545964599459a45a545a845aa450146054609461446154618461a462146" + b"2446294640464246454648465046514652465546564659466246654668468146" + b"85468a4694469546a146a446a6460548114815481a4825484248494850485548" + b"5848614864486648694885489148944896489948a5480149054906490a491049" + b"144915491849214924492649404945494a495149524954495549564959496049" + b"6249654966496a49864989499249954996499849a149a449a649a949164a444a" + b"464a494a554a584a5a4a644a694a944aa54a0150045005500650095012501550" + b"1a50215024502950405045504850515054505550565059506550685086508950" + b"95509850a050a150a650a9500551085109510a51115114511551165118511951" + b"20512551265128512a5141514451455146514951505151515251545155515651" + b"585159515a51615164516551665169518251855191519451955196519951a051" + b"a551aa5101520652125215521a5221522452425245524a525152545255525652" + b"595262526552855290529252955299529a52a452045405541154145415541654" + b"185419542154255428542a54415444544554465449544a545054515454545554" + b"5654585459545a54615462546454655466546954805488548a54915494549554" + b"96549954a154a454a554aa540155025504550555065509551055115512551455" + b"1555165519551a55215524552555265529554055415542554455455546554855" + b"4955505551555255545555555655585559555a55605561556455655566556855" + b"69556a5581558455855589558a559055915594559555965598559955a155a455" + b"a555a655a9550056015602560456065608560956115614561556185619562056" + b"2156225624562556265628562956415645564656485649564a56505651565256" + b"545655565656585659565a566156645665566956825685568656885689568a56" + b"915695569a56a256a556a656a856a95604580558065809581058155818582158" + b"2a58455848584a58515854585558565858585958605862586458655882588958" + b"9058925895589858a158a9580159025905590a59115914591559165919592559" + b"41594459455946594959505951595259545955595659585959595a5961596459" + b"655966596959815985598959915994599559965998599959a559045a085a155a" + b"1a5a205a255a265a295a455a485a495a515a555a565a585a595a625a655a685a" + 
b"6a5a815a8a5a925a955a965a985a9a5aa15a0560146016601960256044605060" + b"5560566058605a60616064606660696081609660a56001610461066109611261" + b"15612161226126612961456149615161556156615961656166616a6184618a61" + b"92619561a161a661a96111621662196240624162466255625662586260628562" + b"91629662a56211641264156416641a6421642664296440644264456448644a64" + b"516454645564566459645a646064626465648464856489649064926494649564" + b"966498649a64a164a464a964056508650a651165156516651965446545654665" + b"496550655165546555655665596561656465656566656965866589658a659165" + b"9565966599659a65a265a565a665a86502660966156620662666286629664066" + b"456648664a66516654665566566658665a666066656668668066826685668a66" + b"9466966698669966a066a466a666aa661668196825684168526855685a686168" + b"6968856891689868a66801690469106915692169246926692969406941694569" + b"4669486951695469556956695969606965696a69826984698a699569a169a469" + b"a569a969116a166a186a416a446a496a506a556a586a5a6a646a656a696a866a" + b"946a986a9a6aa66a0080028008800a802080228028802a804580508051805480" + b"5680598065808080828088808a809580a080a280a880aa800581118114811681" + b"1981258141814481498150815281558156815881598164816681698185818981" + b"948196819981a5810082028208820a8215822082228228822a82518254825982" + b"65828082828288828a829582a082a282a882aa82148419844184448451845584" + b"5a846184648469849484998401850985128515851a8526852985408541854585" + b"4885518554855585568559855a856585668568856a8581858485868589859085" + b"928595859885a68511861686198625864186448649864a865086558659865a86" + b"618666866a86858691869a86a4860088028808880a8815882088228828882a88" + b"41884588518854885988658869888088828888888a889588a088a288a888aa88" + b"05890689118914891689258941894489468949895089528955895a8961896489" + b"858996899989a589008a028a088a0a8a158a208a228a288a2a8a458a518a548a" + b"568a808a828a888a8a8a958aa08aa28aa88aaa8a059011901690189019902590" + b"419046904990559058905a9069906a9085909190949096909990a59001910491" + 
b"069109911091159118911a912191249126912991409145915091519154915591" + b"569159916291659184918691929195919891a191a491a691a991059211921492" + b"19922592449246924992509252925592589266926992859294929692a9920194" + b"04940694109415941894269440944a9451945494559456945894599460946194" + b"62946594849486949294949495949894a194a9940095059508950a9510951195" + b"14951595169519952195259529952a9541954495459546954995509551955295" + b"549555955695589559955a956195649565956695699581958595889591959295" + b"94959595969599959a95a095a295a595a895aa95019604961096159619962096" + b"2696299645964896499651965296559656965996659668968296849689968a96" + b"929694969596a496a696a9960598169819982598419846985098529855985698" + b"5a98649865988598919896989998a59804990699099910991299159918991a99" + b"209921992499269940994299459948994a995199549955995699599962996599" + b"66996a99819984999099929995999a99a199a699059a159a259a449a469a499a" + b"509a559a589a619a859a919a949a959a969a00a002a008a00aa015a020a022a0" + b"28a02aa045a051a054a056a059a080a082a088a08aa095a0a0a0a2a0a8a0aaa0" + b"05a109a111a114a116a119a11aa146a149a151a155a158a15aa161a164a185a1" + b"90a192a196a199a102a208a20aa210a219a222a228a22aa245a251a256a259a2" + b"65a280a282a288a28aa295a2a0a2a2a2a8a2aaa219a425a441a444a450a454a4" + b"55a458a45aa461a465a466a468a469a485a406a509a510a512a515a518a526a5" + b"29a542a545a551a554a555a556a559a565a56aa581a584a585a586a589a592a5" + b"95a598a505a611a616a61aa621a625a644a646a64aa652a655a656a658a660a6" + b"62a686a690a695a696a699a6a1a6a4a6a6a600a802a808a80aa820a822a828a8" + b"2aa851a854a856a859a880a882a888a88aa895a8a0a8a2a8a8a8aaa805a914a9" + b"19a921a925a941a950a955a95aa961a966a969a990a996a900aa02aa08aa0aaa" + b"20aa22aa28aa2aaa51aa54aa56aa80aa82aa88aa8aaa95aaa0aaa2aaa8aaaaaa" + ) + + delta = np.float32(0.125) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, qh = np.hsplit(rest, [QK_K // 8]) + + d = 
d.view(np.float16).astype(np.float32) + qh = qh.view(np.uint16) + + dl = d * (2 * ((qh >> 12) & 7) + 1) + dl = dl.reshape((n_blocks, -1, 1, 1)) + delta = np.where((qh & np.uint16(0x8000)) == 0, cls.delta, -cls.delta) + delta = delta.reshape((n_blocks, -1, 1, 1)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4)) + qs = qs.astype(np.uint16) | ((qh & 7) << 8).reshape((n_blocks, -1)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (dl * (grid + delta)).reshape((n_blocks, -1)) + + +class IQ1_M(__Quant, qtype=GGMLQuantizationType.IQ1_M): + grid_shape = IQ1_S.grid_shape + grid_map = IQ1_S.grid_map + grid_hex = IQ1_S.grid_hex + + delta = IQ1_S.delta + + # Okay *this* type is weird. It's the only one which stores the f16 scales in multiple parts. + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, rest = np.hsplit(blocks, [QK_K // 8]) + qh, scales = np.hsplit(rest, [QK_K // 16]) + + # The f16 scale is packed across multiple bytes + scales = scales.view(np.uint16) + d = (scales.reshape((n_blocks, 4)) & np.uint16(0xF000)) >> np.array([12, 8, 4, 0], dtype=np.uint16).reshape((1, 4)) + d = d[..., 0] | d[..., 1] | d[..., 2] | d[..., 3] + d = d.view(np.float16).astype(np.float32).reshape((n_blocks, 1)) + + scales = scales.reshape(n_blocks, -1, 1) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4)) + scales = (scales & 0x07).reshape((n_blocks, -1)) + dl = d * (2 * scales + 1) + dl = dl.reshape((n_blocks, -1, 2, 1, 1)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + qs = qs.astype(np.uint16) | ((qh & 0x07).astype(np.uint16) << 8).reshape((n_blocks, -1)) + + delta = np.where(qh & 0x08 == 0, cls.delta, -cls.delta) + delta = delta.reshape((n_blocks, -1, 2, 2, 1)) + + assert cls.grid is not None + 
grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 2, 8)) + + return (dl * (grid + delta)).reshape((n_blocks, -1)) + + +class IQ4_NL(__Quant, qtype=GGMLQuantizationType.IQ4_NL): + kvalues = (-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 1)) + + kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16) + qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1)) + + return (d * qs) + + +class IQ4_XS(__Quant, qtype=GGMLQuantizationType.IQ4_XS): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + scales_h, rest = np.hsplit(rest, [2]) + scales_l, qs = np.hsplit(rest, [QK_K // 64]) + + d = d.view(np.float16).astype(np.float32) + scales_h = scales_h.view(np.uint16) + + scales_l = scales_l.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2)) + scales_h = scales_h.reshape((n_blocks, 1, -1)) >> np.array([2 * i for i in range(QK_K // 32)], dtype=np.uint16).reshape((1, -1, 1)) + scales_l = scales_l.reshape((n_blocks, -1)) & np.uint8(0x0F) + scales_h = scales_h.reshape((n_blocks, -1)).astype(np.uint8) & np.uint8(0x03) + + scales = (scales_l | (scales_h << np.uint8(4))).astype(np.int8) - np.int8(32) + dl = (d * scales.astype(np.float32)).reshape((n_blocks, -1, 1)) + + qs = qs.reshape((n_blocks, -1, 1, 16)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1)) + qs = qs.reshape((n_blocks, -1, 32, 1)) & np.uint8(0x0F) + + kvalues = np.array(IQ4_NL.kvalues, 
dtype=np.int8).reshape((1, 1, 1, -1)) + qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1, 32)) + + return (dl * qs).reshape((n_blocks, -1)) diff --git a/gguf-py/tests/test_quants.py b/gguf-py/tests/test_quants.py new file mode 100755 index 0000000000000..8b7a85c2c36d7 --- /dev/null +++ b/gguf-py/tests/test_quants.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 + +# Test gguf.quants so that it exactly matches the C implementation of the (de)quantization + +# NOTE: this is kind of a mess, but at least it worked for initially testing the Python implementations. + +from __future__ import annotations + +import argparse +from math import prod +import os +import sys +from pathlib import Path +import ctypes +import logging +import numpy as np + +# Necessary to load the local gguf package +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): + sys.path.insert(0, str(Path(__file__).parent.parent)) + +import gguf +from gguf.constants import GGMLQuantizationType + + +logger = logging.getLogger("test-quants") + + +c_float_p = ctypes.POINTER(ctypes.c_float) + + +class ggml_init_params(ctypes.Structure): + _fields_ = [ + ("mem_size", ctypes.c_size_t), + ("mem_buffer", ctypes.c_void_p), + ("no_alloc", ctypes.c_bool), + ] + + +class GGMLQuants: + libggml: ctypes.CDLL + + def __init__(self, libggml: Path): + self.libggml = ctypes.CDLL(str(libggml)) + self.libggml.ggml_quantize_chunk.restype = ctypes.c_size_t + # enum ggml_type type, + # const float * src, + # void * dst, + # int64_t start, + # int64_t nrows, + # int64_t n_per_row, + # const float * imatrix) { + self.libggml.ggml_quantize_chunk.argtypes = ( + ctypes.c_int, + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, + ctypes.POINTER(ctypes.c_float), + ) + + self.libggml.ggml_quantize_requires_imatrix.restype = ctypes.c_bool + self.libggml.ggml_quantize_requires_imatrix.argtypes = 
(ctypes.c_int,) + + for t in ( + "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", + "q2_K", "q3_K", "q4_K", "q5_K", "q6_K", + "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m", + "iq4_nl", "iq4_xs", + ): + dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + t) + dequant_func.restype = None + dequant_func.argtypes = (ctypes.c_void_p, ctypes.POINTER(ctypes.c_float), ctypes.c_int64) + + self.libggml.ggml_fp16_to_fp32_row.restype = None + self.libggml.ggml_fp16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64) + self.libggml.ggml_bf16_to_fp32_row.restype = None + self.libggml.ggml_bf16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64) + + self.libggml.ggml_init.argtypes = (ggml_init_params,) + + self.libggml.ggml_init(ggml_init_params(1 * 1024 * 1024, 0, False)) + + def dequantize(self, tensor: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + result = np.zeros(gguf.quant_shape_from_byte_shape(tensor.shape, qtype), dtype=np.float32, order="C") + if qtype == GGMLQuantizationType.F32: + # no-op + result = tensor.view(np.float32) + elif qtype == GGMLQuantizationType.F16: + self.libggml.ggml_fp16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size) + elif qtype == GGMLQuantizationType.BF16: + self.libggml.ggml_bf16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size) + else: + lw_qname = qtype.name.lower() + if lw_qname[-1] == "k": + lw_qname = lw_qname[:-1] + "K" + dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + lw_qname) + dequant_func(tensor.ctypes.data_as(ctypes.c_void_p), result.ctypes.data_as(c_float_p), result.size) + return result + + def quantize(self, data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + result = 
np.zeros(gguf.quant_shape_to_byte_shape(data.shape, qtype), dtype=np.uint8, order="C") + if self.libggml.ggml_quantize_requires_imatrix(qtype.value): + # TODO: is a column-wise sum of squares appropriate? + qw = np.sum((data * data).reshape((-1, data.shape[-1])), axis=0).ctypes.data_as(c_float_p) + else: + qw = ctypes.cast(0, c_float_p) + result_size = self.libggml.ggml_quantize_chunk(qtype.value, data.ctypes.data_as(c_float_p), result.ctypes.data_as(ctypes.c_void_p), 0, prod(data.shape[:-1]), data.shape[-1], qw) + assert result.size == result_size + return result + + +def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType) -> bool: + same = np.array_equal(t1, t2) + if same: + return True + else: + block_size, type_size = gguf.GGML_QUANT_SIZES[qtype] + if t1.dtype == np.float32: + t1 = t1.reshape((-1, block_size)) + t2 = t2.reshape((-1, block_size)) + else: + t1 = t1.reshape((-1, type_size)) + t2 = t2.reshape((-1, type_size)) + x = t1.view(np.uint8) ^ t2.view(np.uint8) + diff_bits = np.count_nonzero(np.unpackbits(x, axis=-1), axis=-1) + num_bad_blocks = np.count_nonzero(diff_bits, axis=0) + if num_bad_blocks == 0 and t1.shape == t2.shape: + logger.debug("Bits are equal, but arrays don't match, likely contains NANs") + return True + logger.debug(f"{num_bad_blocks} bad blocks ({100 * num_bad_blocks / x.shape[0]:.6f}%)") + bad_block_id = np.argmax(diff_bits, axis=0) + logger.debug(f"Worst block id: {bad_block_id}") + logger.debug(f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}") + + sum_diff_bits = np.sum(diff_bits) + logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits/(x.size * 8):.6f}%)") + return False + + +def do_test(libggml_path: Path, quick: bool = False): + ggml_quants = GGMLQuants(libggml_path) + + np.set_printoptions(precision=None, threshold=(4 * 256) + 1, formatter={"int": lambda n: "0x%02X" % n}) + + r = np.random.randn(8, 1024, 
1024).astype(np.float32, copy=False) + + for qtype in (GGMLQuantizationType.F16, *gguf.quants._type_traits.keys()): + has_dequantize = False + has_quantize = False + + try: + gguf.dequantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][1]), dtype=np.uint8), qtype) + has_dequantize = True + except (NotImplementedError, AssertionError) as e: + if isinstance(e, AssertionError): + logger.error(f"Error with {qtype.name}: {e}") + raise e + try: + gguf.quantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][0]), dtype=np.float32), qtype) + has_quantize = True + except (NotImplementedError, AssertionError) as e: + if isinstance(e, AssertionError): + logger.error(f"Error with {qtype.name}: {e}") + raise e + + if not has_dequantize and not has_quantize: + continue + + logger.info(f"Testing {qtype.name}") + + rc = r.copy(order="C") + + pyq = None + ggq = None + + if has_quantize: + logger.debug(f"Quantizing to {qtype.name} with Python") + pyq = gguf.quants.quantize(rc, qtype) + + logger.debug(f"Quantizing to {qtype.name} with C") + ggq = ggml_quants.quantize(rc, qtype) + + if qtype == GGMLQuantizationType.F16: + pyq = pyq.view(np.uint8) + quant_equal = compare_tensors(pyq, ggq, qtype) + + if not quant_equal: + logger.error(f"Quantization to {qtype.name} does not match ❌") + else: + logger.info(f"Quantization to {qtype.name} matches exactly ✅") + + if has_dequantize: + if ggq is None and not quick: + logger.debug(f"Quantizing to {qtype.name} with C") + ggq = ggml_quants.quantize(rc, qtype) + + if ggq is not None: + logger.debug(f"Dequantizing from {qtype.name} with Python") + pydq = gguf.quants.dequantize(ggq, qtype) + logger.debug(f"Dequantizing from {qtype.name} with C") + ggdq = ggml_quants.dequantize(ggq, qtype) + + dequant_equal = compare_tensors(pydq, ggdq, qtype) + + if not dequant_equal: + logger.error(f"Dequantization from {qtype.name} does not match ❌") + else: + logger.info(f"Dequantization from {qtype.name} matches exactly ✅") + + rq_shape = 
gguf.quants.quant_shape_to_byte_shape((8, 1024, 1024 // 2), qtype) + rq = np.random.random(rq_shape).astype(np.float16).view(np.uint8) + + logger.debug(f"Dequantizing random f16 data as {qtype.name} with Python") + pydq = gguf.quants.dequantize(rq, qtype) + logger.debug(f"Dequantizing random f16 data as {qtype.name} with C") + ggdq = ggml_quants.dequantize(rq, qtype) + + dequant_equal = compare_tensors(pydq, ggdq, qtype) + + if not dequant_equal: + logger.error(f"Dequantization from random f16 data as {qtype.name} does not match ❌") + else: + logger.info(f"Dequantization from random f16 data as {qtype.name} matches exactly ✅") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test Python (de)quantization against the reference C implementation") + parser.add_argument("--libggml", type=Path, default=Path(__file__).parent.parent.parent / "build" / "ggml" / "src" / "libggml.so", help="The path to libggml.so") + parser.add_argument("--quick", action="store_true", help="Don't quantize with C when it's not strictly necessary") + + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG) + + do_test(args.libggml, args.quick) diff --git a/include/llama.h b/include/llama.h index 97884b2cdbd84..be0b7d4e83e48 100644 --- a/include/llama.h +++ b/include/llama.h @@ -504,6 +504,9 @@ extern "C" { // Returns true if the model contains an encoder that requires llama_encode() call LLAMA_API bool llama_model_has_encoder(const struct llama_model * model); + // Returns true if the model contains a decoder that requires llama_decode() call + LLAMA_API bool llama_model_has_decoder(const struct llama_model * model); + // For encoder-decoder models, this function returns id of the token that must be provided // to the decoder to start generating output sequence. For other models, it returns -1. 
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model); diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index aa4895c6d6ca4..b29892565209f 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -62,6 +62,7 @@ while read c; do src/ggml*.m \ src/ggml*.metal \ src/ggml*.cu \ + src/ggml-cann/* \ src/ggml-cuda/* \ src/ggml-sycl/* \ src/vulkan-shaders/* \ @@ -108,6 +109,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-alloc.c -> ggml/src/ggml-alloc.c # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h # src/ggml-backend.c -> ggml/src/ggml-backend.c + # src/ggml-cann/* -> ggml/src/ggml-cann/ + # src/ggml-cann.cpp -> ggml/src/ggml-cann.cpp # src/ggml-common.h -> ggml/src/ggml-common.h # src/ggml-cuda/* -> ggml/src/ggml-cuda/ # src/ggml-cuda.cu -> ggml/src/ggml-cuda.cu @@ -126,6 +129,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # include/ggml-alloc.h -> ggml/include/ggml-alloc.h # include/ggml-backend.h -> ggml/include/ggml-backend.h # include/ggml-blas.h -> ggml/include/ggml-blas.h + # include/ggml-cann.h -> ggml/include/ggml-cann.h # include/ggml-cuda.h -> ggml/include/ggml-cuda.h # include/ggml-kompute.h -> ggml/include/ggml-kompute.h # include/ggml-metal.h -> ggml/include/ggml-metal.h @@ -152,6 +156,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \ @@ -169,18 +175,19 @@ if [ -f 
$SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-cann\.h/\1ggml\/include\/ggml-cann.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \ - -e 's/([[:space:]]|[ab]\/)examples\/common\.h/examples\/common.h/g' \ - -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/examples\/common.cpp/g' \ - -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/examples\/common-ggml.h/g' \ - -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \ - -e 's/([[:space:]]|[ab]\/)LICENSE/LICENSE/g' \ - -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/\1examples\/common-ggml.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)LICENSE/\1LICENSE/g' \ + -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/\1scripts\/gen-authors.sh/g' \ > ggml-src.patch.tmp mv ggml-src.patch.tmp ggml-src.patch diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 1b82b1047a024..eef6768b149db 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last 
@@ -1 +1 @@ -18703ad600cc68dbdb04d57434c876989a841d12 +797faa25af14126eb30134d4033139ae3c5428ed diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index d6d7d0a60bfbd..30a62e0888953 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -10,6 +10,8 @@ cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c +cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ +cp -rpv ../ggml/src/ggml-cann.cpp ./ggml/src/ggml-cann.cpp cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ cp -rpv ../ggml/src/ggml-cuda.cu ./ggml/src/ggml-cuda.cu @@ -29,6 +31,7 @@ cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h cp -rpv ../ggml/include/ggml-backend.h ./ggml/include/ggml-backend.h cp -rpv ../ggml/include/ggml-blas.h ./ggml/include/ggml-blas.h +cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h diff --git a/src/llama-impl.h b/src/llama-impl.h index dcc8c1c15a1b1..399b134a7f9bc 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -24,3 +24,18 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void * #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) 
llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) + +// +// helpers +// + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); + } +} diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index e6d6059d03482..749f8571829df 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -16,20 +16,6 @@ // helpers // -static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; - } - s = std::move(result); -} - LLAMA_ATTRIBUTE_FORMAT(1, 2) static std::string format(const char * fmt, ...) 
{ va_list ap; diff --git a/src/llama.cpp b/src/llama.cpp index d6d03fe0406b8..5878c0271f245 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -121,17 +121,6 @@ static std::string trim(const std::string & str) { return str.substr(start, end - start); } -static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - if (search.empty()) { - return; // Avoid infinite loop if 'search' is an empty string - } - size_t pos = 0; - while ((pos = s.find(search, pos)) != std::string::npos) { - s.replace(pos, search.length(), replace); - pos += replace.length(); - } -} - static bool is_float_close(float a, float b, float abs_tol) { // Check for non-negative tolerance if (abs_tol < 0.0) { @@ -219,6 +208,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_BITNET, LLM_ARCH_T5, + LLM_ARCH_T5ENCODER, LLM_ARCH_JAIS, LLM_ARCH_UNKNOWN, }; @@ -263,6 +253,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_T5ENCODER, "t5encoder" }, { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1272,6 +1263,24 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_T5ENCODER, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" }, + { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" }, + { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" }, + { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" }, + { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" }, + { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" }, + { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" }, + { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" }, + { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" }, + { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" }, + { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_JAIS, { @@ -4892,7 +4901,6 @@ static void 
llm_load_hparams( } break; case LLM_ARCH_PHI3: { - ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { @@ -4901,6 +4909,22 @@ static void llm_load_hparams( case 40: model.type = e_model::MODEL_14B; break; default: model.type = e_model::MODEL_UNKNOWN; } + + // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931 + if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) { + // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct + hparams.n_swa = 2047; + } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) { + // default value for Phi-3-mini-128k-instruct + hparams.n_swa = 262144; + } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) { + // default value for Phi-3-medium-128k-instruct + hparams.n_swa = 131072; + } + bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + if (!found_swa && hparams.n_swa == 0) { + throw std::runtime_error("invalid value for sliding_window"); + } } break; case LLM_ARCH_PLAMO: { @@ -5198,6 +5222,12 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_T5ENCODER: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); + model.type = e_model::MODEL_UNKNOWN; + } break; case LLM_ARCH_JAIS: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -7432,6 +7462,42 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_T5ENCODER: + { + const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts; + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + 
model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}); + + layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; case LLM_ARCH_JAIS: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ 
-13146,7 +13212,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_t5() { + struct ggml_cgraph * build_t5_encoder() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -13161,303 +13227,323 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - if (lctx.is_encoding) { - struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + GGML_ASSERT(lctx.is_encoding); + struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - // self-attention - { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur); - cb(Qcur, "Qcur", il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm_enc, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur); - cb(Kcur, "Kcur", il); + // self-attention + { + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); + cb(Qcur, "Qcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur); - cb(Vcur, "Vcur", il); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); + cb(Kcur, "Kcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, 
n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); + struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + cb(kq_b, "kq_b", il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); + kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + cb(v, "v", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - ggml_build_forward_expand(gf, cur); + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); - cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur); - cb(cur, "kqv_out", il); - } + ggml_build_forward_expand(gf, cur); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); + cb(cur, "kqv_out", il); + } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct 
ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm_enc, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, cur, ffn_inp); + // T5 uses relu, flan-T5 uses gelu-gated + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up_enc, NULL, NULL, + model.layers[il].ffn_gate_enc, NULL, NULL, + model.layers[il].ffn_down_enc, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + cb, il); cb(cur, "ffn_out", il); + } - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); - // input for next layer - inpL = cur; + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); } + cb(cur, "l_out", il); - cur = inpL; - cb(cur, "result_embd", -1); + // input for next layer + inpL = cur; + } - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm_enc, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - } else { - GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + cur = inpL; + cb(cur, "result_embd", -1); - struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm_enc, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); - struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); + ggml_build_forward_expand(gf, cur); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + return gf; + } - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + struct ggml_cgraph * build_t5_decoder() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - // self-attention - { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + const int64_t n_embd_head = 
hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); + GGML_ASSERT(!lctx.is_encoding); + GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); + struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); + struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // self-attention + { + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cb(kqv, "kqv", il); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + 0); + cb(k, "k", il); - ggml_build_forward_expand(gf, cur); + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); + cb(v, "v", il); - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); - } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "cross_inp", il); + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * inpCA = cur; + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - // norm - cur = 
llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_norm_cross, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm_cross", il); + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); + struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + cb(kq_b, "kq_b", il); - // cross-attention - { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur); - cb(Qcur, "Qcur", il); + kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc); - cb(Kcur, "Kcur", il); + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cb(kqv, "kqv", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc); - cb(Vcur, "Vcur", il); + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + ggml_build_forward_expand(gf, cur); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cb(cur, "kqv_out", il); + } - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "cross_inp", il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - cb(v, "v", il); + 
struct ggml_tensor * inpCA = cur; - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // norm + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_norm_cross, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm_cross", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // cross-attention + { + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); + cb(Qcur, "Qcur", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); + cb(Kcur, "Kcur", il); - ggml_build_forward_expand(gf, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); + cb(Vcur, "Vcur", il); - cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur); - cb(cur, "kqv_out", il); - } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - } + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - cb(ffn_inp, "ffn_inp", il); + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + kq = 
ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + cb(v, "v", il); - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); - // input for next layer - inpL = cur; + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); + + ggml_build_forward_expand(gf, cur); + + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); + cb(cur, "kqv_out", il); } - cur = inpL; - cb(cur, "result_embd", -1); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + } - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, 
ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); + // T5 uses relu, flan-T5 uses gelu-gated + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } + cur = inpL; + cb(cur, "result_embd", -1); + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); return gf; @@ -13909,7 +13995,15 @@ static struct ggml_cgraph * llama_build_graph( } break; case LLM_ARCH_T5: { - result = llm.build_t5(); + if (lctx.is_encoding) { + result = llm.build_t5_encoder(); + } else { + result = llm.build_t5_decoder(); + } + } break; + case LLM_ARCH_T5ENCODER: + { + result = llm.build_t5_encoder(); } break; case LLM_ARCH_JAIS: { @@ -14357,7 +14451,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { // TODO: use a per-batch flag for logits presence instead const bool has_logits = !cparams.embeddings; - const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE)); + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); const size_t logits_size = has_logits ? 
n_vocab*n_outputs_max : 0; const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; @@ -14628,12 +14722,15 @@ static int llama_decode_internal( res = nullptr; embd = nullptr; } else if (cparams.embeddings) { - res = nullptr; // do not extract logits for embedding case - embd = gf->nodes[gf->n_nodes - 1]; - if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = gf->nodes[gf->n_nodes - 2]; + res = nullptr; // do not extract logits for embedding case + embd = nullptr; + for (int i = gf->n_nodes - 1; i >= 0; --i) { + if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { + embd = gf->nodes[i]; + break; + } } - GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + GGML_ASSERT(embd != nullptr && "missing embeddings tensor"); } else { embd = nullptr; // do not extract embeddings when not needed GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); @@ -14840,9 +14937,24 @@ static int llama_encode_internal( ggml_cgraph * gf = llama_build_graph(lctx, batch, false); // the output embeddings after the final encoder normalization - struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embd = nullptr; - GGML_ASSERT(strcmp(embd->name, "result_norm") == 0); + // there are two cases here + if (llama_model_has_decoder(&lctx.model)) { + // first case is an encoder-decoder T5 model where embeddings are passed to decoder + embd = gf->nodes[gf->n_nodes - 1]; + GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); + } else { + // second case is an encoder-only T5 model + if (cparams.embeddings) { + // only output embeddings if required + embd = gf->nodes[gf->n_nodes - 1]; + if (strcmp(embd->name, "result_embd_pooled") != 0) { + embd = gf->nodes[gf->n_nodes - 2]; + } + GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + } + } ggml_backend_sched_alloc_graph(lctx.sched, gf); @@ -14855,20 +14967,54 @@ 
static int llama_encode_internal( ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); GGML_ASSERT(backend_embd != nullptr); - // extract token embeddings - GGML_ASSERT(lctx.embd != nullptr); + if (llama_model_has_decoder(&lctx.model)) { + lctx.embd_enc.resize(n_tokens*n_embd); + float * embd_out = lctx.embd_enc.data(); - lctx.embd_enc.resize(n_tokens*n_embd); - float * embd_out = lctx.embd_enc.data(); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + // remember the sequence ids used during the encoding - needed for cross attention later + lctx.seq_ids_enc.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < batch.n_seq_id[i]; s++) { + llama_seq_id seq_id = batch.seq_id[i][s]; + lctx.seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(lctx.embd != nullptr); - // remember the sequence ids used during the encoding - needed for cross attention later - lctx.seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < batch.n_seq_id[i]; s++) { - llama_seq_id seq_id = batch.seq_id[i][s]; - lctx.seq_ids_enc[i].insert(seq_id); + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(lctx.embd != nullptr); + float * embd_out = lctx.embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = lctx.embd_seq; + embd_seq_out.clear(); + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = batch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + 
embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } } } } @@ -15304,7 +15450,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { if (n_expert > 1) { - // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly + // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work // for getting the current layer as I initially thought, and we need to resort to parsing the // tensor name. @@ -16578,6 +16724,8 @@ struct llama_context * llama_new_context_with_model( ctx->sampling.rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; + // build worst-case graph for encoder if a model contains encoder + ctx->is_encoding = llama_model_has_encoder(model); uint32_t kv_size = cparams.n_ctx; ggml_type type_k = params.type_k; @@ -16892,6 +17040,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MAMBA: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_T5: + case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: return LLAMA_ROPE_TYPE_NONE; @@ -17039,8 +17188,16 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch bool llama_model_has_encoder(const struct llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5: return true; - default: return false; + case LLM_ARCH_T5: return true; + case LLM_ARCH_T5ENCODER: return true; + default: return false; + } +} + +bool llama_model_has_decoder(const struct llama_model * model) { + switch (model->arch) { 
+ case LLM_ARCH_T5ENCODER: return false; + default: return true; } } @@ -17343,6 +17500,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi // TODO: replace all non-fatal assertions with returned errors or exceptions struct llama_data_write { virtual void write(const void * src, size_t size) = 0; + virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0; virtual size_t get_size_written() = 0; virtual ~llama_data_write() = default; @@ -17465,9 +17623,8 @@ struct llama_data_write { // Read each range of cells of k_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * k_size_row); - ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row); - write(tmp_buf.data(), tmp_buf.size()); + const size_t buf_size = range_size * k_size_row; + write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size); } } @@ -17486,9 +17643,8 @@ struct llama_data_write { // Read each range of cells of v_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; - tmp_buf.resize(range_size * v_size_row); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row); - write(tmp_buf.data(), tmp_buf.size()); + const size_t buf_size = range_size * v_size_row; + write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size); } } } else { @@ -17514,9 +17670,8 @@ struct llama_data_write { for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * kv_size) * v_size_el; - tmp_buf.resize(range_size * v_size_el); - ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size()); - write(tmp_buf.data(), tmp_buf.size()); + 
const size_t buf_size = range_size * v_size_el; + write_tensor_data(kv_self.v_l[il], src_offset, buf_size); } } } @@ -17875,12 +18030,14 @@ struct llama_data_write_dummy : llama_data_write { llama_data_write_dummy() {} - // TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context - void write(const void * /* src */, size_t size) override { size_written += size; } + void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { + size_written += size; + } + size_t get_size_written() override { return size_written; } @@ -17903,6 +18060,16 @@ struct llama_data_write_buffer : llama_data_write { buf_size -= size; } + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ggml_backend_tensor_get(tensor, ptr, offset, size); + ptr += size; + size_written += size; + buf_size -= size; + } + size_t get_size_written() override { return size_written; } @@ -17938,6 +18105,7 @@ struct llama_data_read_buffer : llama_data_read { struct llama_data_write_file : llama_data_write { llama_file * file; size_t size_written = 0; + std::vector temp_buffer; llama_data_write_file(llama_file * f) : file(f) {} @@ -17946,6 +18114,12 @@ struct llama_data_write_file : llama_data_write { size_written += size; } + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + temp_buffer.resize(size); + ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); + write(temp_buffer.data(), temp_buffer.size()); + } + size_t get_size_written() override { return size_written; }