From f5d7b268ec4bf8628aa6ccc9f6631d0230dde76f Mon Sep 17 00:00:00 2001
From: Joan Fontanals
Date: Thu, 6 Jun 2024 09:22:41 +0200
Subject: [PATCH] llama : add jina v2 base code (#7596)

* feat: add changes to handle jina v2 base code

* fix: do not complicate things

* fix: fix the usage of the code model

* fix: fix comments

* fix: fix linting issues

* fix: remove ollama patches

* style : minor

---------

Co-authored-by: Georgi Gerganov
---
 convert-hf-to-gguf-update.py   |  1 +
 convert-hf-to-gguf.py          |  7 ++++++-
 gguf-py/gguf/constants.py      |  1 +
 gguf-py/gguf/tensor_mapping.py |  3 +++
 llama.cpp                      | 17 +++++++++++++----
 5 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 6dae1a59465e0..f43b15760e1b2 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -83,6 +83,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "jina-v2-es",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
     {"name": "smaug-bpe",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
 ]
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index afb9704c8e218..a86864f04861b 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -475,6 +475,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
+        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+            res = "jina-v2-code"
 
         if res is None:
             logger.warning("\n")
@@ -2452,11 +2455,13 @@ def __init__(self, *args, **kwargs):
 
     def get_tensors(self):
         for name, data in super().get_tensors():
-            if 'gated_layers' in name:
+            if 'gated_layer' in name:
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
+                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
+                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
                 yield name1, d1
                 yield name2, d2
                 continue
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index a3c024c8975f5..8908585ccf957 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -415,6 +415,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
         MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.ATTN_NORM_2,
         MODEL_TENSOR.ATTN_OUT_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_Q_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 83e3c4c3381a0..81b4992a51eed 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -102,6 +102,7 @@ class TensorNameMap:
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn",      # falcon40b
+            "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
         ),
 
         # Attention query-key-value
@@ -311,6 +312,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.c_proj",      # starcoder2
             "encoder.layer.{bid}.mlp.wo",         # jina-bert-v2
             "model.layers.{bid}.residual_mlp.w2", # arctic
+            "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -350,6 +352,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm2",                 # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
             "encoder.layer.{bid}.mlp.layernorm",          # jina-bert-v2
+            "encoder.layer.{bid}.layer_norm_2"            # jina-v2-code
         ),
 
         MODEL_TENSOR.SSM_IN: (
diff --git a/llama.cpp b/llama.cpp
index 414d390e891a1..cefb4d1d52dc3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -704,6 +704,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
             { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
             { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
             { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
@@ -4653,8 +4654,7 @@ static void llm_load_vocab(
             LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
             LLAMA_LOG_WARN("%s: \n", __func__);
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (
-                tokenizer_pre == "default") {
+        } else if (tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
                 tokenizer_pre == "llama3" ||
@@ -4681,7 +4681,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -5515,7 +5516,7 @@ static bool llm_load_tensors(
                         layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
                     } else {
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                     }
 
                     layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5557,9 @@ static bool llm_load_tensors(
                     layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
                     layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd});
 
+                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
@@ -8519,6 +8523,11 @@ struct llm_build_context {
             // attention layer norm
             cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
 
+            if (model.layers[il].attn_norm_2 != nullptr) {
+                cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+            }
+
             struct ggml_tensor * ffn_inp = cur;
             cb(ffn_inp, "ffn_inp", il);