From 1d8de31565ae6b87d60cab9d3e4ccc8dc41b23ab Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 13:52:11 +0200 Subject: [PATCH 01/77] model: dbrx convert to gguf #6344 --- convert-hf-to-gguf.py | 54 ++++++++++++ gguf-py/gguf/constants.py | 17 ++++ gguf-py/gguf/gguf_writer.py | 3 + gguf-py/gguf/tensor_mapping.py | 88 ++++++++++--------- .../requirements-convert-hf-to-gguf.txt | 1 + 5 files changed, 123 insertions(+), 40 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 7e601170e925a..9304bb56187bf 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1424,6 +1424,60 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) +@Model.register("DbrxForCausalLM") +class Qwen2MoeModel(Model): + model_arch = gguf.MODEL_ARCH.DBRX + + def set_gguf_parameters(self): + ffn_config = self.hparams["ffn_config"] + attn_config = self.hparams["attn_config"] + self.gguf_writer.add_name(self.hparams["model_type"]) + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(self.hparams["n_layers"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) + self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) + self.gguf_writer.add_clip_kqv(attn_config["clip_qkv"]) + self.gguf_writer.add_file_type(self.ftype) + + self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) + + def _set_vocab_gpt2(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = tokenizer.vocab_size + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + @Model.register("MiniCPMForCausalLM") class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5214764a9ea98..33524487b578b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -54,6 +54,7 @@ class Attention: LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" CAUSAL = "{arch}.attention.causal" + CLIP_KQV = "{arch}.attention.clip_kqv" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" @@ -125,6 +126,7 @@ class MODEL_ARCH(IntEnum): MAMBA = auto() XVERSE = auto() COMMAND_R = auto() + DBRX = auto() class MODEL_TENSOR(IntEnum): @@ -194,6 +196,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", + 
MODEL_ARCH.DBRX: "dbrx", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -639,6 +642,20 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.DBRX: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 2ae6c814b52de..a73c8edbde0cf 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -379,6 +379,9 @@ def add_layer_norm_rms_eps(self, value: float) -> None: def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) + def add_clip_kqv(self, value: int) -> None: + self.add_uint32(Keys.Attention.CLIP_KQV.format(arch=self.arch), value) + def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 345b1b0c72212..48c9bd08d5030 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -79,28 +79,30 @@ class TensorNameMap: block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Attention norm MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen - "transformer.blocks.{bid}.norm_1", # mpt - "transformer.h.{bid}.input_layernorm", # falcon7b - "h.{bid}.input_layernorm", # bloom - "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf - "layers.{bid}.attention_norm", # llama-pth - "language_model.encoder.layers.{bid}.input_layernorm", # persimmon - "model.layers.{bid}.ln1", # yi - "h.{bid}.ln_1", # gpt2 - "transformer.h.{bid}.ln", # phi2 - "model.layers.layers.{bid}.norm", # plamo - "model.layers.{bid}.attention_norm", # internlm2 - "model.layers.{bid}.norm", # mamba-qbert - "backbone.layers.{bid}.norm", # mamba - "transformer.decoder_layer.{bid}.rms_norm", # Grok + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "h.{bid}.input_layernorm", # bloom + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf + "layers.{bid}.attention_norm", # llama-pth + "language_model.encoder.layers.{bid}.input_layernorm", # persimmon + "model.layers.{bid}.ln1", # yi + "h.{bid}.ln_1", # gpt2 + "transformer.h.{bid}.ln", # phi2 + "model.layers.layers.{bid}.norm", # plamo + "model.layers.{bid}.attention_norm", # internlm2 + "model.layers.{bid}.norm", # mamba-qbert + "backbone.layers.{bid}.norm", # mamba + "transformer.decoder_layer.{bid}.rms_norm", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # DBRX ), # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", # falcon40b + "transformer.h.{bid}.ln_attn", # falcon40b + "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", # DBRX ), # Attention query-key-value @@ -108,6 +110,7 @@ class TensorNameMap: "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox "transformer.h.{bid}.attn.c_attn", # gpt2 qwen "transformer.blocks.{bid}.attn.Wqkv", # mpt + 
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight", # DBRX "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon @@ -152,23 +155,24 @@ class TensorNameMap: # Attention output MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.attn.out_proj", # mpt - "transformer.h.{bid}.self_attention.dense", # falcon - "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf - "layers.{bid}.attention.wo", # llama-pth - "encoder.layer.{bid}.attention.output.dense", # bert - "transformer.h.{bid}.attn.out_proj", # gpt-j - "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon - "model.layers.{bid}.self_attn.dense", # persimmon - "h.{bid}.attn.c_proj", # gpt2 - "transformer.h.{bid}.mixer.out_proj", # phi2 - "model.layers.layers.{bid}.self_attn.o_proj", # plamo - "model.layers.{bid}.attention.wo", # internlm2 - "encoder.layers.{bid}.attn.out_proj", # nomic-bert - "transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "h.{bid}.self_attention.dense", # bloom + "model.layers.{bid}.self_attn.o_proj", # llama-hf + "layers.{bid}.attention.wo", # llama-pth + "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.h.{bid}.attn.out_proj", # gpt-j + "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "model.layers.{bid}.self_attn.dense", # persimmon + "h.{bid}.attn.c_proj", # gpt2 + "transformer.h.{bid}.mixer.out_proj", # phi2 + "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 + "encoder.layers.{bid}.attn.out_proj", # nomic-bert + "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # DBRX ), # Attention output norm @@ -202,9 +206,10 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", # mixtral - "model.layers.{bid}.block_sparse_moe.gate", # mixtral - "transformer.decoder_layer.{bid}.router" # Grok + "layers.{bid}.feed_forward.gate", # mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral + "transformer.decoder_layer.{bid}.router", # Grok + "transformer.blocks.{bid}.ffn.router.layer.weight", # DBRX ), # Feed-forward up @@ -233,6 +238,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w2", # DBRX ), # AWQ-activation gate @@ -251,8 +257,9 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) - "transformer.decoder_layer.{bid}.moe.linear" # Grok (merged) + "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # DBRX ), # Feed-forward down @@ -280,6 +287,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_DOWN_EXP: ( "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) + 
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # DBRX ), MODEL_TENSOR.ATTN_Q_NORM: ( diff --git a/requirements/requirements-convert-hf-to-gguf.txt b/requirements/requirements-convert-hf-to-gguf.txt index 6ce840d73cb73..db8888caca774 100644 --- a/requirements/requirements-convert-hf-to-gguf.txt +++ b/requirements/requirements-convert-hf-to-gguf.txt @@ -1,3 +1,4 @@ -r ./requirements-convert.txt torch~=2.1.1 einops~=0.7.0 +tiktoken~=0.6.0 From ed582c1dde11b68aff432d2c831be0f37a9fd4ec Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 15:16:42 +0200 Subject: [PATCH 02/77] llama: support dbrx #6344 --- convert-hf-to-gguf.py | 2 +- gguf-py/gguf/constants.py | 1 - llama.cpp | 255 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 256 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 9304bb56187bf..4b9b0fc8ecbf6 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1438,7 +1438,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(self.hparams["n_heads"]) self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) - self.gguf_writer.add_clip_kqv(attn_config["clip_qkv"]) + self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 33524487b578b..eb628d9d80e05 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -54,7 +54,6 @@ class Attention: LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" CAUSAL = "{arch}.attention.causal" - CLIP_KQV = "{arch}.attention.clip_kqv" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" diff --git a/llama.cpp b/llama.cpp index 9a1c11043b94a..cb1d1e567d01f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -220,6 +220,7 @@ enum llm_arch { LLM_ARCH_MAMBA, LLM_ARCH_XVERSE, LLM_ARCH_COMMAND_R, + LLM_ARCH_DBRX, LLM_ARCH_UNKNOWN, }; @@ -252,6 +253,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_COMMAND_R, "command-r" }, + { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -926,6 +928,23 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_DBRX, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1692,6 +1711,7 @@ enum e_model { MODEL_40B, MODEL_65B, MODEL_70B, + MODEL_132B, MODEL_314B, MODEL_SMALL, MODEL_MEDIUM, @@ -3961,6 +3981,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_DBRX: + { + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_132B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; 
default: (void)0; } @@ -4635,6 +4664,46 @@ static bool llm_load_tensors( layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); } } break; + case LLM_ARCH_DBRX: + { + if (n_expert == 0) { + throw std::runtime_error("DBRX model cannot have zero experts"); + } + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2,"weight", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + + layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); + } + } break; case LLM_ARCH_BAICHUAN: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -7030,6 +7099,187 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_dbrx() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + + // self-attention + { + if (model.layers[il].attn_norm_2) { + // DBRX + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm_2, + NULL, + LLM_NORM, cb, il); + cb(cur, "attn_norm_2", il); + } + 
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + if (hparams.f_clamp_kqv > 0.0f) { + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + } + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + cb(Vcur, "Vcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] + cb(logits, "ffn_moe_logits", il); + + ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] + cb(probs, "ffn_moe_probs", il); + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); + cb(weights, "ffn_moe_weights", il); + + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] + + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] + cb(weights, "ffn_moe_weights_norm", il); + + // compute expert outputs + ggml_tensor * moe_out = nullptr; + + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert; + + ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur); + cb(cur_up, "ffn_moe_up", il); + + ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur); + cb(cur_gate, "ffn_moe_gate", il); + + //GeLU + cur_gate = ggml_gelu(ctx0, cur_gate); + cb(cur_gate, "ffn_moe_gelu", il); + + cur_expert = ggml_mul(ctx0, cur_up, cur_gate); + cb(cur_expert, "ffn_moe_gate_par", il); + + cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd] + 
cb(cur_expert, "ffn_moe_down", il); + + cur_expert = ggml_mul(ctx0, cur_expert, + ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); + cb(cur_expert, "ffn_moe_weighted", il); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); + cb(moe_out, "ffn_moe_out", il); + } + } + + cur = moe_out; + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_starcoder() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -9719,6 +9969,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_command_r(); } break; + case LLM_ARCH_DBRX: + { + result = llm.build_dbrx(); + } break; default: GGML_ASSERT(false); } @@ -14525,6 +14779,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MINICPM: case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: + case LLM_ARCH_DBRX: // FIXME REVIEW @ggerganov I am not sure what to put here return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 From 3e3d2d127c8fbfdbace726f2b0988d55de6f62a7 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 15:46:47 +0200 Subject: [PATCH 03/77] gguf-py: remove wrong clip -> clamp --- gguf-py/gguf/gguf_writer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a73c8edbde0cf..2ae6c814b52de 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -379,9 +379,6 @@ def add_layer_norm_rms_eps(self, value: float) -> None: def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) - def add_clip_kqv(self, value: int) -> None: - self.add_uint32(Keys.Attention.CLIP_KQV.format(arch=self.arch), value) - def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) From 3937100adb6df051011d41d4af3e697fe232547c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 15:57:57 +0200 Subject: [PATCH 04/77] model: dbrx, trust remote code --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4b9b0fc8ecbf6..a847d118d5f25 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1451,7 +1451,7 @@ def _set_vocab_gpt2(self): toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) vocab_size = tokenizer.vocab_size reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()} From c0beb3cf7e2980e805eabea5b23b9950c6729c20 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 15:58:17 +0200 Subject: [PATCH 05/77] llama: add label for model 132B --- llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llama.cpp b/llama.cpp index 
cb1d1e567d01f..f5810df78aa6a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3563,6 +3563,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; + case MODEL_132B: return "132B"; case MODEL_314B: return "314B"; case MODEL_SMALL: return "0.1B"; case MODEL_MEDIUM: return "0.4B"; From 09210334bf84fd024fa3edfe32626fa27db83b89 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 16:00:32 +0200 Subject: [PATCH 06/77] model: dbrx fix python linter in convert-hf-to-gguf.py --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a847d118d5f25..5d858a1234a6a 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1446,7 +1446,6 @@ def set_gguf_parameters(self): def _set_vocab_gpt2(self): dir_model = self.dir_model - hparams = self.hparams tokens: list[str] = [] toktypes: list[int] = [] @@ -1478,6 +1477,7 @@ def _set_vocab_gpt2(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) + @Model.register("MiniCPMForCausalLM") class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM From e4f8ee4f48032f60265567820a9d68723924ac78 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 16:08:25 +0200 Subject: [PATCH 07/77] llama: support dbrx fix norm type --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index f5810df78aa6a..279b39dfba8f5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7128,7 +7128,7 @@ struct llm_build_context { // norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM, cb, il); cb(cur, "attn_norm", il); @@ -7268,7 +7268,7 @@ struct llm_build_context { cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM, cb, -1); cb(cur, "result_norm", -1); // lm_head From a7f9a3eafc8ccac720ae5274e5c7cd0124d2a38f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 18:59:53 +0200 Subject: [PATCH 08/77] dbrx: minor --- convert-hf-to-gguf.py | 1 + gguf-py/gguf/tensor_mapping.py | 16 ++++++++-------- llama.cpp | 14 +++++--------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5d858a1234a6a..4faa04e57c830 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1456,6 +1456,7 @@ def _set_vocab_gpt2(self): reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()} added_vocab = tokenizer.get_added_vocab() + # REVIEW: Not tested yet, need to deep dive this tiktoken for i in range(vocab_size): if i not in reverse_vocab: tokens.append(f"[PAD{i}]") diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 48c9bd08d5030..3638e2aea6c50 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -96,13 +96,13 @@ class TensorNameMap: "model.layers.{bid}.norm", # mamba-qbert "backbone.layers.{bid}.norm", # mamba "transformer.decoder_layer.{bid}.rms_norm", # Grok - "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # DBRX + "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # dbrx ), # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( "transformer.h.{bid}.ln_attn", # falcon40b - "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", # DBRX + "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", 
# dbrx ), # Attention query-key-value @@ -110,7 +110,7 @@ class TensorNameMap: "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox "transformer.h.{bid}.attn.c_attn", # gpt2 qwen "transformer.blocks.{bid}.attn.Wqkv", # mpt - "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight", # DBRX + "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight", # dbrx "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon @@ -172,7 +172,7 @@ class TensorNameMap: "model.layers.{bid}.attention.wo", # internlm2 "encoder.layers.{bid}.attn.out_proj", # nomic-bert "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok - "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # DBRX + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx ), # Attention output norm @@ -209,7 +209,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral "transformer.decoder_layer.{bid}.router", # Grok - "transformer.blocks.{bid}.ffn.router.layer.weight", # DBRX + "transformer.blocks.{bid}.ffn.router.layer.weight", # dbrx ), # Feed-forward up @@ -238,7 +238,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w2", # DBRX + "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx ), # AWQ-activation gate @@ -259,7 +259,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_EXP: ( "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.v1", # DBRX + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx ), # Feed-forward down @@ -287,7 +287,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_DOWN_EXP: ( "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w1", # DBRX + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx ), MODEL_TENSOR.ATTN_Q_NORM: ( diff --git a/llama.cpp b/llama.cpp index 279b39dfba8f5..016e119cba74a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7131,18 +7131,14 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_norm", il); + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm_2, + NULL, + LLM_NORM, cb, il); + cb(cur, "attn_norm_2", il); // self-attention { - if (model.layers[il].attn_norm_2) { - // DBRX - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_2, - NULL, - LLM_NORM, cb, il); - cb(cur, "attn_norm_2", il); - } - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); From e3c1e8127c41a7f2e6085b92487dec2f07f8a900 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 19:21:43 +0200 Subject: [PATCH 09/77] convert: dbrx: fix mixed up and down expert tensors --- gguf-py/gguf/tensor_mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 3638e2aea6c50..6e845c47944a2 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -238,7 +238,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) 
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx ), # AWQ-activation gate @@ -287,7 +287,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_DOWN_EXP: ( "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx ), MODEL_TENSOR.ATTN_Q_NORM: ( From 0a35f5881b3ec735f8ca7b61dfd3c82994a2cc7a Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 19:56:37 +0200 Subject: [PATCH 10/77] convert: dbrx: fix mixed up and down expert tensors llama: dbrx: review graph --- gguf-py/gguf/constants.py | 2 +- gguf-py/gguf/tensor_mapping.py | 42 +++++++++++++++++----------------- llama.cpp | 15 ++++++++---- 3 files changed, 33 insertions(+), 26 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index eb628d9d80e05..a22980b0ff550 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -646,9 +646,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.ATTN_OUT_NORM, MODEL_TENSOR.FFN_GATE_INP, MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 6e845c47944a2..bf498a5fca82e 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -155,31 +155,31 @@ class TensorNameMap: # Attention output MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.attn.out_proj", # mpt - "transformer.h.{bid}.self_attention.dense", # falcon - "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf - "layers.{bid}.attention.wo", # llama-pth - "encoder.layer.{bid}.attention.output.dense", # bert - "transformer.h.{bid}.attn.out_proj", # gpt-j - "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon - "model.layers.{bid}.self_attn.dense", # persimmon - "h.{bid}.attn.c_proj", # gpt2 - "transformer.h.{bid}.mixer.out_proj", # phi2 - "model.layers.layers.{bid}.self_attn.o_proj", # plamo - "model.layers.{bid}.attention.wo", # internlm2 - "encoder.layers.{bid}.attn.out_proj", # nomic-bert - "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok - "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "h.{bid}.self_attention.dense", # bloom + "model.layers.{bid}.self_attn.o_proj", # llama-hf + "layers.{bid}.attention.wo", # llama-pth + "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.h.{bid}.attn.out_proj", # gpt-j + "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "model.layers.{bid}.self_attn.dense", # persimmon + "h.{bid}.attn.c_proj", # gpt2 + "transformer.h.{bid}.mixer.out_proj", # phi2 + "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 + "encoder.layers.{bid}.attn.out_proj", # nomic-bert + 
"transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok ), # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( - "encoder.layer.{bid}.attention.output.LayerNorm", # bert - "encoder.layers.{bid}.norm1", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_1", # Grok + "encoder.layer.{bid}.attention.output.LayerNorm", # bert + "encoder.layers.{bid}.norm1", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_1", # Grok + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx ), # Rotary embeddings diff --git a/llama.cpp b/llama.cpp index 016e119cba74a..360d4b08641d1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4695,7 +4695,6 @@ static bool llm_load_tensors( layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2,"weight", i), {n_embd}); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd}); - layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); @@ -7184,9 +7183,10 @@ struct llm_build_context { // feed-forward network // MoE branch + // FIXME REVIEW: I do not see this op in https://huggingface.co/databricks/dbrx-instruct/blob/464e701f50aef4c1b59c81fb5667819a5d08e108/modeling_dbrx.py#L727 cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + NULL, NULL, + LLM_NORM, cb, il); cb(cur, "ffn_norm", il); ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] @@ -7244,9 +7244,16 @@ struct llm_build_context { cb(moe_out, "ffn_moe_out", il); } } - cur = moe_out; + // DbrxNormAttentionNorm + { + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].layer_out_norm, NULL, + LLM_NORM, cb, il); + cb(cur, "layer_out_norm", il); + } + cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); From c8e6f903e0bd9e6c55c75dd29f370bc44ec208c5 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 20:09:01 +0200 Subject: [PATCH 11/77] doc: dbrx: add the model as supported --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bb66b6c6898f7..17699c358abf2 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well. 
- [x] LLaMA 2 🦙🦙 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) +- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct) - [X] Falcon - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) From 916b91852be3dc78350cf5860f97afcd2fbeb6ec Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 20:30:30 +0200 Subject: [PATCH 12/77] convert: dbrx: fix remove wrong ATTN_OUT_NORM tensor, add output layer mapping --- gguf-py/gguf/constants.py | 1 - gguf-py/gguf/tensor_mapping.py | 16 +++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a22980b0ff550..ce697f71405dc 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -648,7 +648,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_NORM_2, - MODEL_TENSOR.ATTN_OUT_NORM, MODEL_TENSOR.FFN_GATE_INP, MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index bf498a5fca82e..0b455d113e2b8 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -52,6 +52,7 @@ class TensorNameMap: "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 + "transformer.wte.weight", # dbrx ), # Output norm @@ -68,6 +69,7 @@ class TensorNameMap: "model.norm_f", # mamba-qbert "backbone.norm_f", # mamba "transformer.rms_norm", # Grok + "transformer.norm_f.weight", # dbrx ), # Rope frequencies @@ -176,10 +178,9 @@ class TensorNameMap: # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( - "encoder.layer.{bid}.attention.output.LayerNorm", # bert - "encoder.layers.{bid}.norm1", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_1", # Grok - "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx + "encoder.layer.{bid}.attention.output.LayerNorm", # bert + "encoder.layers.{bid}.norm1", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_1", # Grok ), # Rotary embeddings @@ -307,9 +308,10 @@ class TensorNameMap: ), MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx ), MODEL_TENSOR.SSM_IN: ( From 03da419fc069180fe292dad803ea4f2290bd38dc Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 20:43:46 +0200 Subject: [PATCH 13/77] llama: dbrx: remove wrong attn output layer in model arch --- llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 360d4b08641d1..944317a8eac25 100644 --- a/llama.cpp +++ b/llama.cpp @@ -935,7 +935,6 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, From 
76f266beef0b6fc5fb4c59ad1367382807c3d07f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 21:10:19 +0200 Subject: [PATCH 14/77] scripts: get-wikitext-2 add unzip --- scripts/get-wikitext-2.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/get-wikitext-2.sh b/scripts/get-wikitext-2.sh index 7ca760fa61304..b01476a46015a 100755 --- a/scripts/get-wikitext-2.sh +++ b/scripts/get-wikitext-2.sh @@ -1,10 +1,11 @@ #!/bin/bash wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip +unzip wikitext-2-raw-v1.zip echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]" +echo " ./perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]" echo "" exit 0 From 9c7dedb0f34a196a04f7e473e077243eeac8ca85 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 22:22:57 +0200 Subject: [PATCH 15/77] llama: dbrx: no attention output layer --- llama.cpp | 93 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/llama.cpp b/llama.cpp index 944317a8eac25..74a455cda0779 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7165,7 +7165,7 @@ struct llm_build_context { cb(Vcur, "Vcur", il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + NULL, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -7182,68 +7182,69 @@ struct llm_build_context { // feed-forward network // MoE branch - // FIXME REVIEW: I do not see this op in https://huggingface.co/databricks/dbrx-instruct/blob/464e701f50aef4c1b59c81fb5667819a5d08e108/modeling_dbrx.py#L727 - cur = llm_build_norm(ctx0, ffn_inp, hparams, - NULL, NULL, - LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - - ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] - cb(logits, "ffn_moe_logits", il); + { + // FIXME REVIEW: I do not see this op in https://huggingface.co/databricks/dbrx-instruct/blob/464e701f50aef4c1b59c81fb5667819a5d08e108/modeling_dbrx.py#L727 + cur = llm_build_norm(ctx0, ffn_inp, hparams, + NULL, NULL, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); - ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] - cb(probs, "ffn_moe_probs", il); + ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] + cb(logits, "ffn_moe_logits", il); - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] - cb(selected_experts->src[0], "ffn_moe_argsort", il); + ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] + cb(probs, "ffn_moe_probs", il); - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); - cb(weights, "ffn_moe_weights", il); + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] + cb(selected_experts->src[0], "ffn_moe_argsort", il); - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); + cb(weights, "ffn_moe_weights", il); - ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); - cb(weights_sum, "ffn_moe_weights_sum", il); + weights = ggml_reshape_2d(ctx0, weights, 
n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] - weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] - cb(weights, "ffn_moe_weights_norm", il); + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); + cb(weights_sum, "ffn_moe_weights_sum", il); - // compute expert outputs - ggml_tensor * moe_out = nullptr; + weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] + cb(weights, "ffn_moe_weights_norm", il); - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert; + // compute expert outputs + ggml_tensor * moe_out = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert; - ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur); - cb(cur_up, "ffn_moe_up", il); + ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur); + cb(cur_up, "ffn_moe_up", il); - ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur); - cb(cur_gate, "ffn_moe_gate", il); + ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur); + cb(cur_gate, "ffn_moe_gate", il); - //GeLU - cur_gate = ggml_gelu(ctx0, cur_gate); - cb(cur_gate, "ffn_moe_gelu", il); + //GeLU + cur_gate = ggml_gelu(ctx0, cur_gate); + cb(cur_gate, "ffn_moe_gelu", il); - cur_expert = ggml_mul(ctx0, cur_up, cur_gate); - cb(cur_expert, "ffn_moe_gate_par", il); + cur_expert = ggml_mul(ctx0, cur_up, cur_gate); + cb(cur_expert, "ffn_moe_gate_par", il); - cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd] - cb(cur_expert, "ffn_moe_down", il); + cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd] + cb(cur_expert, "ffn_moe_down", il); - cur_expert = ggml_mul(ctx0, cur_expert, - ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); - cb(cur_expert, "ffn_moe_weighted", il); + cur_expert = ggml_mul(ctx0, cur_expert, + ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i * weights->nb[0])); + cb(cur_expert, "ffn_moe_weighted", il); - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx0, moe_out, cur_expert); - cb(moe_out, "ffn_moe_out", il); + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); + cb(moe_out, "ffn_moe_out", il); + } } + cur = moe_out; } - cur = moe_out; // DbrxNormAttentionNorm { From fe8089871e1227fd8ab3c1ea0997a9fb91098050 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 23:27:29 +0200 Subject: [PATCH 16/77] model: dbrx: fix missing embedding tensor, mix with output layer --- gguf-py/gguf/tensor_mapping.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0b455d113e2b8..0bf7ae7b2b9a0 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -24,6 +24,7 @@ class TensorNameMap: "backbone.embedding", # mamba "backbone.embeddings", # mamba-hf "transformer.in_out_embed", # Grok + "transformer.wte.weight", # dbrx ), # Token type embeddings @@ -52,7 +53,7 @@ class TensorNameMap: "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 - "transformer.wte.weight", # dbrx + "lm_head.weight", # dbrx ), # Output norm From 
4f12a580d93a459bd79e7b715705bdee916fea11 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 23:35:23 +0200 Subject: [PATCH 17/77] llama: dbrx: remove not existing condition on empty output layer --- llama.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 74a455cda0779..8e6580877ea60 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4676,12 +4676,6 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); - // if output is NULL, init from the input tok embed - if (model.output == NULL) { - model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); - } } for (int i = 0; i < n_layer; ++i) { From 7e7cd53ca6f52ffffe35d44439f3db1c0ac4dfa8 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 6 Apr 2024 23:55:37 +0200 Subject: [PATCH 18/77] llama: dbrx: remove unnecessary optional tensor on FFN_GATE_EXPS --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 7347ae6557fb8..b8ca64b6544b8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4692,7 +4692,7 @@ static bool llm_load_tensors( layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd}); layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); - layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}); layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}); layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); From 52c403355f14c78cef36b6c6e4cb77f77eb08471 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 03:16:33 +0200 Subject: [PATCH 19/77] llama: increase maximum experts allowed --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index b8ca64b6544b8..2f61faa9b832a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -105,7 +105,7 @@ #endif #define LLAMA_MAX_NODES 8192 -#define LLAMA_MAX_EXPERTS 8 +#define LLAMA_MAX_EXPERTS 16 // From 06a59abf0a89a9c10e67a59744951dd1ad8ab468 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 03:17:24 +0200 Subject: [PATCH 20/77] model: dbrx: convert add n_ff --- convert-hf-to-gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4faa04e57c830..6bbe8a0a618f8 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1441,6 +1441,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) From 305ac3b61be400e4afeb0e0458efeec5f91703ce Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 05:01:33 +0200 Subject: [PATCH 21/77] llama: dbrx: quantize fix n_attention_wv tensor name 
--- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 2f61faa9b832a..a76d5f7ec1da1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13717,7 +13717,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const std::string name = ggml_get_name(meta); // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { + if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos || name.find("attn.Wqkv.weight") != std::string::npos) { ++qs.n_attention_wv; } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; From b6522a9f5b8b82e23016121fd761a8c1b16b3f31 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 04:23:32 +0200 Subject: [PATCH 22/77] model: dbrx: convert fix tokenizer --- convert-hf-to-gguf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 6bbe8a0a618f8..61e2462418ff6 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1457,6 +1457,8 @@ def _set_vocab_gpt2(self): reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()} added_vocab = tokenizer.get_added_vocab() + self.gguf_writer.add_chat_template(tokenizer.default_chat_template) + # REVIEW: Not tested yet, need to deep dive this tiktoken for i in range(vocab_size): if i not in reverse_vocab: @@ -1476,7 +1478,8 @@ def _set_vocab_gpt2(self): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + special_vocab = gguf.SpecialVocab(dir_model) # FIXME https://huggingface.co/databricks/dbrx-instruct/blob/main/tokenizer_config.json + special_vocab.merges = [] special_vocab.add_to_gguf(self.gguf_writer) From dccb0126378da098af757d858e99dca4b5cfd883 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 05:09:17 +0200 Subject: [PATCH 23/77] llama: dbrx: quantize fix n_attention_wv tensor name --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index a76d5f7ec1da1..5922db5a5c0fc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13717,7 +13717,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const std::string name = ggml_get_name(meta); // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos || name.find("attn.Wqkv.weight") != std::string::npos) { + if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos || name == LLM_TN(model.arch)(LLM_TENSOR_ATTN_QKV, "weight")) { ++qs.n_attention_wv; } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; From 61be4b91a68959f4aa3bbff5f548e7557b0b505d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 12:15:16 +0200 Subject: [PATCH 24/77] model: convert-hf-to-gguf.py add _set_vocab_tiktoken gpt2 backed on llama.cpp --- convert-hf-to-gguf.py | 83 ++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 61e2462418ff6..1e53f35e871b5 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -390,6 +390,51 @@ 
def _set_vocab_llama_hf(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_tiktoken(self): + # https://github.com/openai/tiktoken + dir_model = self.dir_model + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + + vocab_size = tokenizer.vocab_size + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()} + added_vocab = tokenizer.get_added_vocab() + merges = [] + + # FIXME REVIEW should we extract this from QwenModel to base Model class ? + mergeable_ranks = tokenizer.encoding._mergeable_ranks + for token, rank in mergeable_ranks.items(): + reverse_vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) == 2 + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + for i in range(vocab_size): + if reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + # FIXME REVIEW should we introduce tiktoken in llama.cpp ? + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + # FIXME REVIEW how to add special tokens https://huggingface.co/databricks/dbrx-instruct/blob/main/tiktoken.py#L193 + special_vocab.add_to_gguf(self.gguf_writer) + @Model.register("GPTNeoXForCausalLM") class GPTNeoXModel(Model): @@ -1445,42 +1490,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) - def _set_vocab_gpt2(self): - dir_model = self.dir_model - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = tokenizer.vocab_size - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()} - added_vocab = tokenizer.get_added_vocab() - - self.gguf_writer.add_chat_template(tokenizer.default_chat_template) - - # REVIEW: Not tested yet, need to deep dive this tiktoken - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model) # FIXME https://huggingface.co/databricks/dbrx-instruct/blob/main/tokenizer_config.json - special_vocab.merges = [] - special_vocab.add_to_gguf(self.gguf_writer) + def set_vocab(self): + self._set_vocab_tiktoken() @Model.register("MiniCPMForCausalLM") From 1fb6d95c1de98001f60ad932b58b8cb22a959019 Mon 
Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 15:40:21 +0200 Subject: [PATCH 25/77] model: convert-hf-to-gguf.py fix classname conflict with qwen2 --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 1e53f35e871b5..5cbe6e8853dc6 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1470,7 +1470,7 @@ def write_tensors(self): @Model.register("DbrxForCausalLM") -class Qwen2MoeModel(Model): +class DbrxModel(Model): model_arch = gguf.MODEL_ARCH.DBRX def set_gguf_parameters(self): From 200ce21436ffc3839b1bd66b7ce295515ae943a6 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 15:54:19 +0200 Subject: [PATCH 26/77] model: dbrx: convert-hf-to-gguf.py fix fix ftype missing, fix tensor names does not suffix with .weight --- convert-hf-to-gguf.py | 13 ++++--- gguf-py/gguf/tensor_mapping.py | 67 ++++++++++++++++------------------ 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5cbe6e8853dc6..7be74957370f1 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -95,17 +95,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(n_ctx) print(f"gguf: context length = {n_ctx}") - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - self.gguf_writer.add_embedding_length(n_embd) - print(f"gguf: embedding length = {n_embd}") + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + print(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) print(f"gguf: feed forward length = {n_ff}") - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_head_count(n_head) - print(f"gguf: head count = {n_head}") + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + print(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) @@ -1474,6 +1474,7 @@ class DbrxModel(Model): model_arch = gguf.MODEL_ARCH.DBRX def set_gguf_parameters(self): + super().set_gguf_parameters() ffn_config = self.hparams["ffn_config"] attn_config = self.hparams["attn_config"] self.gguf_writer.add_name(self.hparams["model_type"]) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0bf7ae7b2b9a0..e3d930dedd2a1 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -10,7 +10,7 @@ class TensorNameMap: # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 gpt-j mpt refact qwen + "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx "transformer.word_embeddings", # falcon "word_embeddings", # bloom "model.embed_tokens", # llama-hf @@ -24,7 +24,6 @@ class TensorNameMap: "backbone.embedding", # mamba "backbone.embeddings", # mamba-hf "transformer.in_out_embed", # Grok - "transformer.wte.weight", # dbrx ), # Token type embeddings @@ -49,11 +48,10 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # 
persimmon "lm_head.linear", # phi2 - "lm_head.weight", # dbrx ), # Output norm @@ -62,7 +60,7 @@ class TensorNameMap: "transformer.ln_f", # gpt2 gpt-j falcon "model.norm", # llama-hf baichuan internlm2 "norm", # llama-pth - "transformer.norm_f", # mpt + "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 "language_model.encoder.final_layernorm", # persimmon "model.final_layernorm", # persimmon @@ -70,7 +68,6 @@ class TensorNameMap: "model.norm_f", # mamba-qbert "backbone.norm_f", # mamba "transformer.rms_norm", # Grok - "transformer.norm_f.weight", # dbrx ), # Rope frequencies @@ -82,30 +79,30 @@ class TensorNameMap: block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Attention norm MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen - "transformer.blocks.{bid}.norm_1", # mpt - "transformer.h.{bid}.input_layernorm", # falcon7b - "h.{bid}.input_layernorm", # bloom - "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf - "layers.{bid}.attention_norm", # llama-pth - "language_model.encoder.layers.{bid}.input_layernorm", # persimmon - "model.layers.{bid}.ln1", # yi - "h.{bid}.ln_1", # gpt2 - "transformer.h.{bid}.ln", # phi2 - "model.layers.layers.{bid}.norm", # plamo - "model.layers.{bid}.attention_norm", # internlm2 - "model.layers.{bid}.norm", # mamba-qbert - "backbone.layers.{bid}.norm", # mamba - "transformer.decoder_layer.{bid}.rms_norm", # Grok - "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # dbrx + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "h.{bid}.input_layernorm", # bloom + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf + "layers.{bid}.attention_norm", # llama-pth + "language_model.encoder.layers.{bid}.input_layernorm", # persimmon + "model.layers.{bid}.ln1", # yi + "h.{bid}.ln_1", # gpt2 + "transformer.h.{bid}.ln", # phi2 + "model.layers.layers.{bid}.norm", # plamo + "model.layers.{bid}.attention_norm", # internlm2 + "model.layers.{bid}.norm", # mamba-qbert + "backbone.layers.{bid}.norm", # mamba + "transformer.decoder_layer.{bid}.rms_norm", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx ), # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", # falcon40b - "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", # dbrx + "transformer.h.{bid}.ln_attn", # falcon40b + "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), # Attention query-key-value @@ -113,7 +110,7 @@ class TensorNameMap: "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox "transformer.h.{bid}.attn.c_attn", # gpt2 qwen "transformer.blocks.{bid}.attn.Wqkv", # mpt - "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight", # dbrx + "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon @@ -208,10 +205,10 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", # mixtral - "model.layers.{bid}.block_sparse_moe.gate", # mixtral - "transformer.decoder_layer.{bid}.router", # Grok - "transformer.blocks.{bid}.ffn.router.layer.weight", # dbrx + "layers.{bid}.feed_forward.gate", # 
mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral + "transformer.decoder_layer.{bid}.router", # Grok + "transformer.blocks.{bid}.ffn.router.layer", # dbrx ), # Feed-forward up @@ -309,10 +306,10 @@ class TensorNameMap: ), MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_3", # Grok - "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx ), MODEL_TENSOR.SSM_IN: ( From 9e17dad087aa5c63e337e5503b6588ef14ab1a6f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 15:57:36 +0200 Subject: [PATCH 27/77] model: dbrx: convert-hf-to-gguf.py add chat template --- convert-hf-to-gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 7be74957370f1..640c8dde7049d 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -432,6 +432,7 @@ def _set_vocab_tiktoken(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) special_vocab.merges = merges + special_vocab.chat_template = tokenizer.default_chat_template # FIXME REVIEW how to add special tokens https://huggingface.co/databricks/dbrx-instruct/blob/main/tiktoken.py#L193 special_vocab.add_to_gguf(self.gguf_writer) From d7546fda641494b3c43afd21a0176378e7515525 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 15:59:07 +0200 Subject: [PATCH 28/77] llama: quantize: remove wrong look for tensor qkv name as it was badly missing the .weight suffix --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 5922db5a5c0fc..2f61faa9b832a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13717,7 +13717,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const std::string name = ggml_get_name(meta); // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos || name == LLM_TN(model.arch)(LLM_TENSOR_ATTN_QKV, "weight")) { + if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { ++qs.n_attention_wv; } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; From 3a9dc2eee2e75a621c96b466006d0db6a3aced8d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 17:21:35 +0200 Subject: [PATCH 29/77] model: dbrx: convert-hf-to-gguf.py fix 'token_embd.weight' has wrong shape, fix special tokens --- convert-hf-to-gguf.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 640c8dde7049d..429af208aed21 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -393,29 +393,36 @@ def _set_vocab_llama_hf(self): def _set_vocab_tiktoken(self): # https://github.com/openai/tiktoken dir_model = self.dir_model + hparams = self.hparams tokens: list[str] = [] toktypes: list[int] = [] from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - - vocab_size = tokenizer.vocab_size - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()} - 
added_vocab = tokenizer.get_added_vocab() + vocab_size = hparams["vocab_size"] + assert max(tokenizer.get_vocab().values()) < vocab_size + vocab = {} merges = [] # FIXME REVIEW should we extract this from QwenModel to base Model class ? mergeable_ranks = tokenizer.encoding._mergeable_ranks for token, rank in mergeable_ranks.items(): - reverse_vocab[QwenModel.token_bytes_to_string(token)] = rank + vocab[QwenModel.token_bytes_to_string(token)] = rank if len(token) == 1: continue merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) assert len(merged) == 2 merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.get_added_vocab() + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} + for i in range(vocab_size): - if reverse_vocab[i] in added_vocab: + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: tokens.append(reverse_vocab[i]) if tokenizer.added_tokens_decoder[i].special: toktypes.append(gguf.TokenType.CONTROL) @@ -425,15 +432,22 @@ def _set_vocab_tiktoken(self): tokens.append(reverse_vocab[i]) toktypes.append(gguf.TokenType.NORMAL) - # FIXME REVIEW should we introduce tiktoken in llama.cpp ? self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges special_vocab.chat_template = tokenizer.default_chat_template - # FIXME REVIEW how to add special tokens https://huggingface.co/databricks/dbrx-instruct/blob/main/tiktoken.py#L193 + special_vocab.merges = merges + tk_endoftext = tokenizer.encoding._special_tokens["<|endoftext|>"] + + # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: + special_vocab._set_special_token("bos", tk_endoftext) + special_vocab._set_special_token("eos", tk_endoftext) + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tk_endoftext) + special_vocab.add_to_gguf(self.gguf_writer) From 8154617ff228cb43cfdc0d26a148dc1e41e1c4e8 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 17:25:39 +0200 Subject: [PATCH 30/77] model: dbrx: convert-hf-to-gguf.py support python 3.8 --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 429af208aed21..655428c7e7b4d 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -416,7 +416,7 @@ def _set_vocab_tiktoken(self): # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.get_added_vocab() - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in ({**vocab, **added_vocab}).items()} for i in range(vocab_size): if i not in reverse_vocab: From 2449ef48a9036e47f204c360ca5ce464391b3713 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 16:57:13 +0200 Subject: [PATCH 31/77] llama: dbrx: no weight suffix in ffn_gate_exps, ffn_up_exps and ffn_down_exps. Output tensor not optional. 
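Context: the DBRX checkpoint stores its expert tensors without the usual ".weight"
suffix (see model.safetensors.index.json in the databricks/dbrx-instruct repo), so this
commit drops the suffix from the loader-side tn() calls for the expert tensors. The next
commit takes the opposite approach and keeps the loader conventional, instead forcing
the suffix on the converter side, roughly (a sketch, not the exact code):

    # converter-side sketch: append ".weight" to expert tensor names that lack it
    for exp_tensor_name in exp_tensor_names:
        if exp_tensor_name in name and not name.endswith(".weight"):
            name += ".weight"
            break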
--- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 2f61faa9b832a..a30443433d45a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4677,7 +4677,7 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } for (int i = 0; i < n_layer; ++i) { @@ -4692,9 +4692,9 @@ static bool llm_load_tensors( layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd}); layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); - layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}); - layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}); - layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, i), {n_embd, n_ff, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, i), {n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, i), {n_embd, n_ff, n_expert}); layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); } From 1bd94270e54e7c7ab876826b760cf568ea615d18 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 17:55:22 +0200 Subject: [PATCH 32/77] llama: quantize: remove wrong look for tensor qkv name as it was badly missing the .weight suffix model: dbrx: convert to gguf force experts tensors to have .weight suffix --- convert-hf-to-gguf.py | 72 +++++++++++++++++++++++++++++++++++++------ llama.cpp | 8 ++--- 2 files changed, 67 insertions(+), 13 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 655428c7e7b4d..24a0f5ed6e3b3 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -95,17 +95,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(n_ctx) print(f"gguf: context length = {n_ctx}") - if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: - self.gguf_writer.add_embedding_length(n_embd) - print(f"gguf: embedding length = {n_embd}") + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + self.gguf_writer.add_embedding_length(n_embd) + print(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) print(f"gguf: feed forward length = {n_ff}") - if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: - self.gguf_writer.add_head_count(n_head) - print(f"gguf: head count = {n_head}") + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_head_count(n_head) + print(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) @@ -1489,23 +1489,77 @@ class DbrxModel(Model): model_arch = gguf.MODEL_ARCH.DBRX def set_gguf_parameters(self): - super().set_gguf_parameters() ffn_config = 
self.hparams["ffn_config"] attn_config = self.hparams["attn_config"] self.gguf_writer.add_name(self.hparams["model_type"]) + self.gguf_writer.add_block_count(self.hparams["n_layers"]) + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_block_count(self.hparams["n_layers"]) + self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) + self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) + self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) + self.gguf_writer.add_file_type(self.ftype) + print(f"gguf: file type = {self.ftype}") + + def write_tensors(self): + block_count = self.hparams.get("n_layers") + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. + # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 + exp_tensor_names = ["ffn.experts.mlp.v1", "ffn.experts.mlp.w1", "ffn.experts.mlp.w2"] + for exp_tensor_name in exp_tensor_names: + if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: + name += ".weight" + break + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + def set_vocab(self): self._set_vocab_tiktoken() diff --git a/llama.cpp b/llama.cpp index a30443433d45a..ee75cbab7fdfe 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4691,10 +4691,10 @@ static bool llm_load_tensors( layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd}); - layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); - layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, i), {n_embd, n_ff, n_expert}); - layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, i), {n_ff, n_embd, n_expert}); - layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, i), {n_embd, n_ff, n_expert}); + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS,"weight", i), {n_embd, n_ff, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS,"weight", i), {n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); } From e9987c66d0b65f1494e1b4045a3d475eb7eef5e7 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 18:21:57 +0200 Subject: [PATCH 33/77] llama: dbrx: fix tensor qkv number of elements --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index ee75cbab7fdfe..e290611202402 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4689,7 +4689,7 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2,"weight", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd}); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS,"weight", i), {n_embd, n_ff, n_expert}); From d151d8fad9d87bfb128d31e6a0ec8d9188318649 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 18:41:33 +0200 Subject: [PATCH 34/77] model: dbrx: convert reshape expert tensors to 3D --- convert-hf-to-gguf.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 24a0f5ed6e3b3..60363bc1bdbbc 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1516,15 +1516,12 @@ def write_tensors(self): block_count = self.hparams.get("n_layers") tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in 
self.get_tensors(): - # In MoE models the ffn tensors are typically most of the model weights, - # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. - # Every other model has the weight names ending in .weight, - # let's assume that is the convention which is not the case for dbrx: - # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 + # Specific behavior for experts tensors: reshape to 3D and add suffix .weight exp_tensor_names = ["ffn.experts.mlp.v1", "ffn.experts.mlp.w1", "ffn.experts.mlp.w2"] + experts = False for exp_tensor_name in exp_tensor_names: if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: - name += ".weight" + experts = True break old_dtype = data_torch.dtype @@ -1536,7 +1533,12 @@ def write_tensors(self): data = data_torch.squeeze().numpy() # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. + # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 + new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight", ".bias")) if new_name is None: print(f"Can not map tensor {name!r}") sys.exit() @@ -1556,6 +1558,11 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) + # Reshape experts tensors from 2D to 3D as expected by GeLU + if experts and n_dims == 2: + data = data.reshape((self.hparams["d_model"], self.hparams["ffn_config"]["ffn_hidden_size"], self.hparams["ffn_config"]["moe_num_experts"])) + n_dims = len(data.shape) + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) From f062b834ed471e2f2637c2947a3c87f9ace724e9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 18:47:37 +0200 Subject: [PATCH 35/77] model: dbrx: convert experts to f16 --- convert-hf-to-gguf.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 60363bc1bdbbc..e3d5b5763e6e8 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1538,7 +1538,7 @@ def write_tensors(self): # Every other model has the weight names ending in .weight, # let's assume that is the convention which is not the case for dbrx: # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 - new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight", ".bias")) + new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=".weight") if new_name is None: print(f"Can not map tensor {name!r}") sys.exit() @@ -1546,23 +1546,19 @@ def write_tensors(self): n_dims = len(data.shape) data_dtype = data.dtype + # Reshape experts tensors from 2D to 3D as expected by GeLU + if experts and n_dims == 2: + data = data.reshape((self.hparams["d_model"], self.hparams["ffn_config"]["ffn_hidden_size"], self.hparams["ffn_config"]["moe_num_experts"])) + n_dims = len(data.shape) + # if f32 desired, convert any float16 to float32 if self.ftype == 0 and data_dtype == np.float16: data = 
data.astype(np.float32) - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1: data = data.astype(np.float16) - # Reshape experts tensors from 2D to 3D as expected by GeLU - if experts and n_dims == 2: - data = data.reshape((self.hparams["d_model"], self.hparams["ffn_config"]["ffn_hidden_size"], self.hparams["ffn_config"]["moe_num_experts"])) - n_dims = len(data.shape) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) From dbfd59114f0cc654ba4e948ef62f765cb7287b44 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 18:52:28 +0200 Subject: [PATCH 36/77] model: dbrx: fix tensor names mapping broken --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e3d5b5763e6e8..ca0a73ad0a63f 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1538,7 +1538,7 @@ def write_tensors(self): # Every other model has the weight names ending in .weight, # let's assume that is the convention which is not the case for dbrx: # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 - new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=".weight") + new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) if new_name is None: print(f"Can not map tensor {name!r}") sys.exit() From 7dd84b092418c3a6a8cb7646cde3b33092cddff4 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 19:12:24 +0200 Subject: [PATCH 37/77] model: dbrx: fix expert reshape --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ca0a73ad0a63f..1e9effc26f7df 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1548,7 +1548,7 @@ def write_tensors(self): # Reshape experts tensors from 2D to 3D as expected by GeLU if experts and n_dims == 2: - data = data.reshape((self.hparams["d_model"], self.hparams["ffn_config"]["ffn_hidden_size"], self.hparams["ffn_config"]["moe_num_experts"])) + data = data.reshape((self.hparams["ffn_config"]["moe_num_experts"], self.hparams["ffn_config"]["ffn_hidden_size"], self.hparams["d_model"])) n_dims = len(data.shape) # if f32 desired, convert any float16 to float32 From c9bddbf253ff594cc001e23840053088170abc9e Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 19:38:35 +0200 Subject: [PATCH 38/77] model: dbrx: fix expert reshape --- convert-hf-to-gguf.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 1e9effc26f7df..31ebc1ab0735a 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1516,12 +1516,19 @@ def write_tensors(self): block_count = self.hparams.get("n_layers") tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): + n_expert = self.hparams["ffn_config"]["moe_num_experts"] + n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] + n_embd = self.hparams["d_model"] 
+ # Specific behavior for experts tensors: reshape to 3D and add suffix .weight - exp_tensor_names = ["ffn.experts.mlp.v1", "ffn.experts.mlp.w1", "ffn.experts.mlp.w2"] + exp_tensor_names = {"ffn.experts.mlp.v1": (n_embd, n_ff, n_expert), # LLM_TENSOR_FFN_GATE_EXPS + "ffn.experts.mlp.w1": (n_embd, n_ff, n_expert), # LLM_TENSOR_FFN_DOWN_EXPS + "ffn.experts.mlp.w2": (n_ff, n_embd, n_expert)} # LLM_TENSOR_FFN_UP_EXPS experts = False - for exp_tensor_name in exp_tensor_names: + for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True + expert_reshape = exp_tensor_names[exp_tensor_name] break old_dtype = data_torch.dtype @@ -1548,7 +1555,7 @@ def write_tensors(self): # Reshape experts tensors from 2D to 3D as expected by GeLU if experts and n_dims == 2: - data = data.reshape((self.hparams["ffn_config"]["moe_num_experts"], self.hparams["ffn_config"]["ffn_hidden_size"], self.hparams["d_model"])) + data = data.reshape(expert_reshape) n_dims = len(data.shape) # if f32 desired, convert any float16 to float32 From e2c919962b215839a91a9e1b7036234b6a3c4136 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 20:10:16 +0200 Subject: [PATCH 39/77] model: dbrx: fix again sic expert reshape --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 31ebc1ab0735a..4c2d32298885e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1528,7 +1528,7 @@ def write_tensors(self): for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True - expert_reshape = exp_tensor_names[exp_tensor_name] + expert_reshape = exp_tensor_names[exp_tensor_name].reverse() break old_dtype = data_torch.dtype From 50b43736735fdca1d73051749ad1fec278f038da Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 20:14:43 +0200 Subject: [PATCH 40/77] model: dbrx: weird fix expert reshape --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4c2d32298885e..802f3dbbea46b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1528,7 +1528,7 @@ def write_tensors(self): for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True - expert_reshape = exp_tensor_names[exp_tensor_name].reverse() + expert_reshape = exp_tensor_names[exp_tensor_name][::-1] break old_dtype = data_torch.dtype From 0ab1bae854bed8b69a042dc14e66b6ce5daaa267 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 20:56:53 +0200 Subject: [PATCH 41/77] llama: dbrx: output norm dim --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index e290611202402..f6b8dacdd5ed2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4696,7 +4696,7 @@ static bool llm_load_tensors( layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS,"weight", i), {n_ff, n_embd, n_expert}); layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); - layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); + layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd, n_embd}); } } break; case LLM_ARCH_BAICHUAN: From 
830e46d7ae1c0190bbb5e5f58fc3932a58222998 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 23:40:12 +0200 Subject: [PATCH 42/77] llama: dbrx: fix last normalization --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index f6b8dacdd5ed2..de51aaf46b757 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7242,6 +7242,9 @@ struct llm_build_context { cur = moe_out; } + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + // DbrxNormAttentionNorm { cur = llm_build_norm(ctx0, cur, hparams, @@ -7250,9 +7253,6 @@ struct llm_build_context { cb(cur, "layer_out_norm", il); } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); if (layer_dir != nullptr) { cur = ggml_add(ctx0, cur, layer_dir); From 2897aa628c1c09fa8f1239a4ef86b1b02a509a9d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 7 Apr 2024 23:47:26 +0200 Subject: [PATCH 43/77] llama: dbrx: revert --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index de51aaf46b757..f6b8dacdd5ed2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7242,9 +7242,6 @@ struct llm_build_context { cur = moe_out; } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - // DbrxNormAttentionNorm { cur = llm_build_norm(ctx0, cur, hparams, @@ -7253,6 +7250,9 @@ struct llm_build_context { cb(cur, "layer_out_norm", il); } + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); if (layer_dir != nullptr) { cur = ggml_add(ctx0, cur, layer_dir); From 993f8360294aa447a15c5289f20cd492e7c20d99 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 00:11:19 +0200 Subject: [PATCH 44/77] llama: dbrx: move norm2 after attention, fix build kv --- llama.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index f6b8dacdd5ed2..f397cc28f8169 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7125,12 +7125,6 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_norm", il); - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_2, - NULL, - LLM_NORM, cb, il); - cb(cur, "attn_norm_2", il); - // self-attention { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); @@ -7161,7 +7155,7 @@ struct llm_build_context { cb(Vcur, "Vcur", il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - NULL, NULL, + model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -7179,9 +7173,8 @@ struct llm_build_context { // feed-forward network // MoE branch { - // FIXME REVIEW: I do not see this op in https://huggingface.co/databricks/dbrx-instruct/blob/464e701f50aef4c1b59c81fb5667819a5d08e108/modeling_dbrx.py#L727 cur = llm_build_norm(ctx0, ffn_inp, hparams, - NULL, NULL, + model.layers[il].attn_norm_2, NULL, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); From b01b062ab5506f2b04d5a7c7ce43db70e6aed532 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 00:25:54 +0200 Subject: [PATCH 45/77] llama: dbrx: fix build kv att out --- llama.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index f397cc28f8169..1d165ce747ced 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7155,7 +7155,7 @@ struct llm_build_context { cb(Vcur, "Vcur", il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, 
model.layers[il].bo, + model.layers[il].attn_out_norm, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -7235,14 +7235,6 @@ struct llm_build_context { cur = moe_out; } - // DbrxNormAttentionNorm - { - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].layer_out_norm, NULL, - LLM_NORM, cb, il); - cb(cur, "layer_out_norm", il); - } - cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); From 74e6d876f6563d21194e9e33b96c398c506627e6 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 00:37:28 +0200 Subject: [PATCH 46/77] llama: dbrx: fix build kv att out tensor name --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 1d165ce747ced..9a1e72273c278 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7155,7 +7155,7 @@ struct llm_build_context { cb(Vcur, "Vcur", il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].attn_out_norm, NULL, + model.layers[il].layer_out_norm, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); } From f8f97e74f9337f901723b3bbe1bf90f0f465f239 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 01:17:33 +0200 Subject: [PATCH 47/77] llama: dbrx: hardcode nn.LayerNorm epsilon --- llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llama.cpp b/llama.cpp index 9a1e72273c278..cac0dd0c650a5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3986,6 +3986,7 @@ static void llm_load_hparams( case LLM_ARCH_DBRX: { ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + hparams.f_norm_eps = 1.e-5; // REVIEW is that OK ? https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html switch (hparams.n_layer) { case 40: model.type = e_model::MODEL_132B; break; From 71f9e479aa9e8fcef8158dfe721bb8338d628200 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 01:29:00 +0200 Subject: [PATCH 48/77] llama: dbrx: Try another rope type --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index cac0dd0c650a5..c7ece7c5d133e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3986,7 +3986,7 @@ static void llm_load_hparams( case LLM_ARCH_DBRX: { ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); - hparams.f_norm_eps = 1.e-5; // REVIEW is that OK ? https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html + hparams.f_norm_eps = 1.e-5; // REVIEW is that OK ? https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html, should we put in the converter ? 
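// 1e-5 matches the default eps of torch.nn.LayerNorm referenced above.
// Later commits in this series stop hardcoding it: the converter writes
// add_layer_norm_eps(1e-5) and llm_load_hparams reads it back through
// LLM_KV_ATTENTION_LAYERNORM_EPS.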
switch (hparams.n_layer) { case 40: model.type = e_model::MODEL_132B; break; @@ -14765,12 +14765,12 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MINICPM: case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: - case LLM_ARCH_DBRX: // FIXME REVIEW @ggerganov I am not sure what to put here return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 case LLM_ARCH_FALCON: case LLM_ARCH_GROK: + case LLM_ARCH_DBRX: // FIXME REVIEW @ggerganov I am not sure what to put here case LLM_ARCH_PERSIMMON: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: From 52c6276e12c0966ed67dc368c2c12bc5c635ed61 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 10:43:36 +0200 Subject: [PATCH 49/77] llama: dbrx: fix k scale --- llama.cpp | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/llama.cpp b/llama.cpp index c7ece7c5d133e..fdab763d76544 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7128,6 +7128,10 @@ struct llm_build_context { // self-attention { + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -7136,28 +7140,32 @@ struct llm_build_context { cb(cur, "wqkv_clamped", il); } - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - cb(Vcur, "Vcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].layer_out_norm, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens,kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -14770,7 +14778,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { // the pairs of head values are offset by n_rot/2 case LLM_ARCH_FALCON: case LLM_ARCH_GROK: 
- case LLM_ARCH_DBRX: // FIXME REVIEW @ggerganov I am not sure what to put here + case LLM_ARCH_DBRX: case LLM_ARCH_PERSIMMON: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: From 8e226884019c015af9637c3c44718ce935fb78da Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 11:22:24 +0200 Subject: [PATCH 50/77] llama: dbrx: move norm epsilon to convert. Fix missing normalization. --- convert-hf-to-gguf.py | 2 ++ llama.cpp | 27 +++++++++++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 802f3dbbea46b..ca48fe37141d2 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1509,6 +1509,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) + self.gguf_writer.add_layer_norm_eps(1e-5) + self.gguf_writer.add_file_type(self.ftype) print(f"gguf: file type = {self.ftype}") diff --git a/llama.cpp b/llama.cpp index fdab763d76544..3d6365d215e7c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3986,7 +3986,6 @@ static void llm_load_hparams( case LLM_ARCH_DBRX: { ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); - hparams.f_norm_eps = 1.e-5; // REVIEW is that OK ? https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html, should we put in the converter ? switch (hparams.n_layer) { case 40: model.type = e_model::MODEL_132B; break; @@ -7133,12 +7132,11 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = ggml_norm(ctx0, cur, hparams.f_norm_eps); cb(cur, "wqkv", il); - if (hparams.f_clamp_kqv > 0.0f) { - cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(cur, "wqkv_clamped", il); - } + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); @@ -7148,24 +7146,25 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].layer_out_norm, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens,kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + model.layers[il].layer_out_norm, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + + cur = ggml_norm(ctx0, cur, hparams.f_norm_eps); } if (il == n_layer - 1) 
{ From 35dce3e1452acfb6d40fd93efb90baacc5c356ac Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 14:02:08 +0200 Subject: [PATCH 51/77] llama: dbrx: rename tensor to actual meaning. Fix normalization in graph. Permute expert tensors to the llama.cpp layout --- convert-hf-to-gguf.py | 13 ++++++---- gguf-py/gguf/constants.py | 4 +-- gguf-py/gguf/tensor_mapping.py | 46 +++++++++++++++++----------------- llama.cpp | 20 +++++++-------- 4 files changed, 42 insertions(+), 41 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ca48fe37141d2..5325e2f01a8b5 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1523,18 +1523,21 @@ def write_tensors(self): n_embd = self.hparams["d_model"] # Specific behavior for experts tensors: reshape to 3D and add suffix .weight - exp_tensor_names = {"ffn.experts.mlp.v1": (n_embd, n_ff, n_expert), # LLM_TENSOR_FFN_GATE_EXPS - "ffn.experts.mlp.w1": (n_embd, n_ff, n_expert), # LLM_TENSOR_FFN_DOWN_EXPS - "ffn.experts.mlp.w2": (n_ff, n_embd, n_expert)} # LLM_TENSOR_FFN_UP_EXPS + exp_tensor_names = {"ffn.experts.mlp.v1": (2, 1, 3), # LLM_TENSOR_FFN_GATE_EXPS(n_embd, n_ff, n_expert) + "ffn.experts.mlp.w2": (1, 2, 3), # LLM_TENSOR_FFN_DOWN_EXPS(n_ff, n_embd, n_expert) + "ffn.experts.mlp.w1": (2, 1, 3)} # LLM_TENSOR_FFN_UP_EXPS (n_embd, n_ff, n_expert) experts = False for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True - expert_reshape = exp_tensor_names[exp_tensor_name][::-1] + expert_permute = exp_tensor_names[exp_tensor_name][::-1] break old_dtype = data_torch.dtype + if experts: + data_torch = data_torch.view(n_expert, n_ff, n_embd) + # convert any unsupported data types to float32 if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) @@ -1557,7 +1560,7 @@ def write_tensors(self): # Reshape experts tensors from 2D to 3D as expected by GeLU if experts and n_dims == 2: - data = data.reshape(expert_reshape) + data = data.transpose(expert_permute) n_dims = len(data.shape) # if f32 desired, convert any float16 to float32 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f6ade5b22311c..886256102d28c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -646,9 +646,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.FFN_GATE_INP, MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index e3d930dedd2a1..5872e2b23f038 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -101,8 +101,7 @@ class TensorNameMap: # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", # falcon40b - "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx + "transformer.h.{bid}.ln_attn", # falcon40b ), # Attention query-key-value @@ -155,23 +154,24 @@ class TensorNameMap: # Attention output MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.attn.out_proj", # mpt - "transformer.h.{bid}.self_attention.dense", # falcon - "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf - "layers.{bid}.attention.wo", # 
llama-pth - "encoder.layer.{bid}.attention.output.dense", # bert - "transformer.h.{bid}.attn.out_proj", # gpt-j - "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon - "model.layers.{bid}.self_attn.dense", # persimmon - "h.{bid}.attn.c_proj", # gpt2 - "transformer.h.{bid}.mixer.out_proj", # phi2 - "model.layers.layers.{bid}.self_attn.o_proj", # plamo - "model.layers.{bid}.attention.wo", # internlm2 - "encoder.layers.{bid}.attn.out_proj", # nomic-bert - "transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "h.{bid}.self_attention.dense", # bloom + "model.layers.{bid}.self_attn.o_proj", # llama-hf + "layers.{bid}.attention.wo", # llama-pth + "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.h.{bid}.attn.out_proj", # gpt-j + "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "model.layers.{bid}.self_attn.dense", # persimmon + "h.{bid}.attn.c_proj", # gpt2 + "transformer.h.{bid}.mixer.out_proj", # phi2 + "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 + "encoder.layers.{bid}.attn.out_proj", # nomic-bert + "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx ), # Attention output norm @@ -306,10 +306,10 @@ class TensorNameMap: ), MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_3", # Grok - "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), MODEL_TENSOR.SSM_IN: ( diff --git a/llama.cpp b/llama.cpp index 3d6365d215e7c..14fd010b40f75 100644 --- a/llama.cpp +++ b/llama.cpp @@ -938,7 +938,7 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, @@ -4687,16 +4687,16 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2,"weight", i), {n_embd}); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS,"weight", i), {n_embd, n_ff, n_expert}); layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS,"weight", i), {n_ff, n_embd, n_expert}); layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, 
n_expert}); - layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd, n_embd}); + layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); } } break; case LLM_ARCH_BAICHUAN: @@ -7132,7 +7132,6 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cur = ggml_norm(ctx0, cur, hparams.f_norm_eps); cb(cur, "wqkv", il); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -7161,10 +7160,9 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].layer_out_norm, model.layers[il].bo, + model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cur = ggml_norm(ctx0, cur, hparams.f_norm_eps); } if (il == n_layer - 1) { @@ -7181,11 +7179,6 @@ struct llm_build_context { // feed-forward network // MoE branch { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].attn_norm_2, NULL, - LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] cb(logits, "ffn_moe_logits", il); @@ -7243,6 +7236,11 @@ struct llm_build_context { cur = moe_out; } + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].layer_out_norm, NULL, + LLM_NORM, cb, il); + cb(cur, "layer_out_norm", il); + cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); From 506cc2ea53263a1b2596890221388ec11e5e8faf Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 14:09:06 +0200 Subject: [PATCH 52/77] llama: dbrx: convert remove previous reverse --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5325e2f01a8b5..7cdd39a275449 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1530,7 +1530,7 @@ def write_tensors(self): for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True - expert_permute = exp_tensor_names[exp_tensor_name][::-1] + expert_permute = exp_tensor_names[exp_tensor_name] break old_dtype = data_torch.dtype From eb0847e6b1c82f08f847052d855c20adb30093ab Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 14:38:21 +0200 Subject: [PATCH 53/77] llama: dbrx: load norm eps in hparams --- llama.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 14fd010b40f75..808a700ae690c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3985,7 +3985,8 @@ static void llm_load_hparams( } break; case LLM_ARCH_DBRX: { - ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); switch (hparams.n_layer) { case 40: model.type = e_model::MODEL_132B; break; From 81f308ad646d4bb295ab70bf089dcaab03eeb15e Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 15:04:18 +0200 Subject: [PATCH 54/77] llama: dbrx: fix experts tensor layout --- convert-hf-to-gguf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 7cdd39a275449..94b392be9bdcb 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1523,9 +1523,10 @@ def write_tensors(self): 
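# For the permutation tuples below, recall that ggml's ne order is the reverse of the
# numpy shape: an expert tensor exported with numpy shape (n_expert, n_ff, n_embd)
# is reported by llm_load_tensors as {n_embd, n_ff, n_expert}. A quick sanity check,
# assuming toy values for n_expert / n_ff / n_embd:
#
#     import numpy as np
#     toy = np.zeros((n_expert, n_ff, n_embd), dtype=np.float32)
#     print(toy.shape[::-1])  # the ne tuple ggml will see for this layout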
n_embd = self.hparams["d_model"] # Specific behavior for experts tensors: reshape to 3D and add suffix .weight - exp_tensor_names = {"ffn.experts.mlp.v1": (2, 1, 3), # LLM_TENSOR_FFN_GATE_EXPS(n_embd, n_ff, n_expert) - "ffn.experts.mlp.w2": (1, 2, 3), # LLM_TENSOR_FFN_DOWN_EXPS(n_ff, n_embd, n_expert) - "ffn.experts.mlp.w1": (2, 1, 3)} # LLM_TENSOR_FFN_UP_EXPS (n_embd, n_ff, n_expert) + # orginal implementation expects (n_expert, n_ff, n_embd) + exp_tensor_names = {"ffn.experts.mlp.v1": (2, 1, 0), # LLM_TENSOR_FFN_GATE_EXPS(n_embd, n_ff, n_expert) + "ffn.experts.mlp.w2": (1, 2, 0), # LLM_TENSOR_FFN_DOWN_EXPS(n_ff, n_embd, n_expert) + "ffn.experts.mlp.w1": (2, 1, 0)} # LLM_TENSOR_FFN_UP_EXPS (n_embd, n_ff, n_expert) experts = False for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: From 21fb24aa45d7da7b4f5ab0597764b94b93fa5462 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 16:55:56 +0200 Subject: [PATCH 55/77] model: dbrx: convert-hf-to-gguf.py fix experts tensors shapes --- convert-hf-to-gguf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 94b392be9bdcb..7e4dfd6095c66 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1522,7 +1522,7 @@ def write_tensors(self): n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] - # Specific behavior for experts tensors: reshape to 3D and add suffix .weight + # Specific behavior for experts tensors: suffix .weight, reshape to 3D and transpose # orginal implementation expects (n_expert, n_ff, n_embd) exp_tensor_names = {"ffn.experts.mlp.v1": (2, 1, 0), # LLM_TENSOR_FFN_GATE_EXPS(n_embd, n_ff, n_expert) "ffn.experts.mlp.w2": (1, 2, 0), # LLM_TENSOR_FFN_DOWN_EXPS(n_ff, n_embd, n_expert) @@ -1536,6 +1536,7 @@ def write_tensors(self): old_dtype = data_torch.dtype + # View experts tensors as 3D if experts: data_torch = data_torch.view(n_expert, n_ff, n_embd) @@ -1559,8 +1560,8 @@ def write_tensors(self): n_dims = len(data.shape) data_dtype = data.dtype - # Reshape experts tensors from 2D to 3D as expected by GeLU - if experts and n_dims == 2: + # Transpose experts to the expected llama.cpp format + if experts: data = data.transpose(expert_permute) n_dims = len(data.shape) From f20c04f01fcbc3fae3450242a85d982c6a57d01d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 17:45:35 +0200 Subject: [PATCH 56/77] llama: factorize moe graph implementation between grok, mixtral and dbrx --- llama.cpp | 234 +++++++++++++++--------------------------------------- 1 file changed, 63 insertions(+), 171 deletions(-) diff --git a/llama.cpp b/llama.cpp index 808a700ae690c..26dc24ebc826e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6457,62 +6457,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] - cb(probs, "ffn_moe_probs", il); - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); - cb(weights, "ffn_moe_weights", il); - - weights = 
ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] - - ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] - cb(weights, "ffn_moe_weights_norm", il); - - // compute expert outputs - ggml_tensor * moe_out = nullptr; - - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert; - - ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur); - cb(cur_up, "ffn_moe_up", il); - - ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur); - cb(cur_gate, "ffn_moe_gate", il); - - cur_gate = ggml_silu(ctx0, cur_gate); - cb(cur_gate, "ffn_moe_silu", il); - - cur_expert = ggml_mul(ctx0, cur_up, cur_gate); - cb(cur_expert, "ffn_moe_gate_par", il); - - cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd] - cb(cur_expert, "ffn_moe_down", il); - - cur_expert = ggml_mul(ctx0, cur_expert, - ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); - cb(cur_expert, "ffn_moe_weighted", il); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx0, moe_out, cur_expert); - cb(moe_out, "ffn_moe_out", il); - } - } - - cur = moe_out; + cur = build_moe(cur, n_tokens, il); } cur = ggml_add(ctx0, cur, ffn_inp); @@ -6544,6 +6489,65 @@ struct llm_build_context { return gf; } + ggml_tensor * build_moe(ggml_tensor * cur, int32_t n_tokens, int il) { + ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] + cb(logits, "ffn_moe_logits", il); + + ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] + cb(probs, "ffn_moe_probs", il); + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); + cb(weights, "ffn_moe_weights", il); + + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] + + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] + cb(weights, "ffn_moe_weights_norm", il); + + // compute expert outputs + ggml_tensor * moe_out = nullptr; + + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert; + + ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur); + cb(cur_up, "ffn_moe_up", il); + + ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur); + cb(cur_gate, "ffn_moe_gate", il); + + cur_gate = ggml_silu(ctx0, cur_gate); + cb(cur_gate, "ffn_moe_silu", il); + + cur_expert = ggml_mul(ctx0, cur_up, cur_gate); + cb(cur_expert, "ffn_moe_gate_par", il); + + cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd] + cb(cur_expert, "ffn_moe_down", il); + + cur_expert = ggml_mul(ctx0, cur_expert, + ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); + cb(cur_expert, "ffn_moe_weighted", il); + + if (i == 0) { + moe_out = cur_expert; + } else 
{ + moe_out = ggml_add(ctx0, moe_out, cur_expert); + cb(moe_out, "ffn_moe_out", il); + } + } + + return moe_out; + } + struct ggml_cgraph * build_baichuan() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -6991,63 +6995,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] - cb(probs, "ffn_moe_probs", il); - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); - cb(weights, "ffn_moe_weights", il); - - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] - - ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] - cb(weights, "ffn_moe_weights_norm", il); - - // compute expert outputs - ggml_tensor * moe_out = nullptr; - - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert; - - ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur); - cb(cur_up, "ffn_moe_up", il); - - ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur); - cb(cur_gate, "ffn_moe_gate", il); - - //GeLU - cur_gate = ggml_gelu(ctx0, cur_gate); - cb(cur_gate, "ffn_moe_gelu", il); - - cur_expert = ggml_mul(ctx0, cur_up, cur_gate); - cb(cur_expert, "ffn_moe_gate_par", il); - - cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd] - cb(cur_expert, "ffn_moe_down", il); - - cur_expert = ggml_mul(ctx0, cur_expert, - ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); - cb(cur_expert, "ffn_moe_weighted", il); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx0, moe_out, cur_expert); - cb(moe_out, "ffn_moe_out", il); - } - } - - cur = moe_out; + cur = build_moe(cur, n_tokens, il); // Grok // if layer_out_norm is present then apply it before adding the input @@ -7163,7 +7111,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } if (il == n_layer - 1) { @@ -7179,64 +7126,9 @@ struct llm_build_context { // feed-forward network // MoE branch - { - ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] - cb(probs, "ffn_moe_probs", il); - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); - cb(weights, "ffn_moe_weights", il); - - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, 
num_experts_per_tok] - - ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] - cb(weights, "ffn_moe_weights_norm", il); - - // compute expert outputs - ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert; - - ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur); - cb(cur_up, "ffn_moe_up", il); - - ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur); - cb(cur_gate, "ffn_moe_gate", il); - - //GeLU - cur_gate = ggml_gelu(ctx0, cur_gate); - cb(cur_gate, "ffn_moe_gelu", il); - - cur_expert = ggml_mul(ctx0, cur_up, cur_gate); - cb(cur_expert, "ffn_moe_gate_par", il); - - cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd] - cb(cur_expert, "ffn_moe_down", il); - - cur_expert = ggml_mul(ctx0, cur_expert, - ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i * weights->nb[0])); - cb(cur_expert, "ffn_moe_weighted", il); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx0, moe_out, cur_expert); - cb(moe_out, "ffn_moe_out", il); - } - } - cur = moe_out; - } + cur = build_moe(cur, n_tokens, il); + // DBRX norm2 cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, NULL, LLM_NORM, cb, il); From 48909ed2a7afa48d9dbef3be8361af950f2b403f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 19:01:44 +0200 Subject: [PATCH 57/77] model: dbrx convert permute experts directly torch, log shape --- convert-hf-to-gguf.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 7e4dfd6095c66..b62d0747a5188 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1531,15 +1531,11 @@ def write_tensors(self): for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True - expert_permute = exp_tensor_names[exp_tensor_name] + data_torch = data_torch.view(n_expert, n_ff, n_embd).permute(*exp_tensor_names[exp_tensor_name]) break old_dtype = data_torch.dtype - # View experts tensors as 3D - if experts: - data_torch = data_torch.view(n_expert, n_ff, n_embd) - # convert any unsupported data types to float32 if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) @@ -1560,11 +1556,6 @@ def write_tensors(self): n_dims = len(data.shape) data_dtype = data.dtype - # Transpose experts to the expected llama.cpp format - if experts: - data = data.transpose(expert_permute) - n_dims = len(data.shape) - # if f32 desired, convert any float16 to float32 if self.ftype == 0 and data_dtype == np.float16: data = data.astype(np.float32) @@ -1573,7 +1564,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) From 18a84feddafc14a8d6ec69d73512c34eedd1a4bf Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 19:12:53 +0200 Subject: [PATCH 58/77] llama: dbrx: fix experts 3D tensor layout (again) --- convert-hf-to-gguf.py | 12 
++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b62d0747a5188..7d81a69ece900 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1522,16 +1522,16 @@ def write_tensors(self): n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] - # Specific behavior for experts tensors: suffix .weight, reshape to 3D and transpose + # Specific behavior for experts tensors: suffix .weight, reshape to 3D # orginal implementation expects (n_expert, n_ff, n_embd) - exp_tensor_names = {"ffn.experts.mlp.v1": (2, 1, 0), # LLM_TENSOR_FFN_GATE_EXPS(n_embd, n_ff, n_expert) - "ffn.experts.mlp.w2": (1, 2, 0), # LLM_TENSOR_FFN_DOWN_EXPS(n_ff, n_embd, n_expert) - "ffn.experts.mlp.w1": (2, 1, 0)} # LLM_TENSOR_FFN_UP_EXPS (n_embd, n_ff, n_expert) + exp_tensor_names = {"ffn.experts.mlp.v1", # LLM_TENSOR_FFN_GATE_EXPS ne {n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2", # LLM_TENSOR_FFN_DOWN_EXPS ne {n_ff, n_embd, n_expert} + "ffn.experts.mlp.w1"} # LLM_TENSOR_FFN_UP_EXPS ne {n_embd, n_ff, n_expert } experts = False - for exp_tensor_name in exp_tensor_names.keys(): + for exp_tensor_name in exp_tensor_names: if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True - data_torch = data_torch.view(n_expert, n_ff, n_embd).permute(*exp_tensor_names[exp_tensor_name]) + data_torch = data_torch.view(n_expert, n_ff, n_embd) break old_dtype = data_torch.dtype From 996895292188770e4bc37625f943004119329c69 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 19:37:23 +0200 Subject: [PATCH 59/77] llama: dbrx: fix experts 3D tensor layout (again) --- convert-hf-to-gguf.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 7d81a69ece900..635b2bd61e6c4 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1522,16 +1522,16 @@ def write_tensors(self): n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] - # Specific behavior for experts tensors: suffix .weight, reshape to 3D + # Specific behavior for experts tensors: suffix .weight, reshape to 3D and transpose # orginal implementation expects (n_expert, n_ff, n_embd) - exp_tensor_names = {"ffn.experts.mlp.v1", # LLM_TENSOR_FFN_GATE_EXPS ne {n_embd, n_ff, n_expert} - "ffn.experts.mlp.w2", # LLM_TENSOR_FFN_DOWN_EXPS ne {n_ff, n_embd, n_expert} - "ffn.experts.mlp.w1"} # LLM_TENSOR_FFN_UP_EXPS ne {n_embd, n_ff, n_expert } + exp_tensor_names = {"ffn.experts.mlp.v1": (0, 1, 2), # LLM_TENSOR_FFN_GATE_EXPS(n_embd, n_ff, n_expert) + "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS(n_ff, n_embd, n_expert) + "ffn.experts.mlp.w1": (0, 1, 2)} # LLM_TENSOR_FFN_UP_EXPS (n_embd, n_ff, n_expert) experts = False - for exp_tensor_name in exp_tensor_names: + for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True - data_torch = data_torch.view(n_expert, n_ff, n_embd) + data_torch = data_torch.view(n_expert, n_ff, n_embd).permute(*exp_tensor_names[exp_tensor_name]) break old_dtype = data_torch.dtype From e66f1e3448e3c97f24db212e730a1686e1ba5667 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 20:08:54 +0200 Subject: [PATCH 60/77] llama: dbrx: document changes, permute only FFN_DOWN_EXPS. 
Add a check for ftype --- convert-hf-to-gguf.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 635b2bd61e6c4..a639e39ef268b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1522,16 +1522,21 @@ def write_tensors(self): n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] - # Specific behavior for experts tensors: suffix .weight, reshape to 3D and transpose - # orginal implementation expects (n_expert, n_ff, n_embd) - exp_tensor_names = {"ffn.experts.mlp.v1": (0, 1, 2), # LLM_TENSOR_FFN_GATE_EXPS(n_embd, n_ff, n_expert) - "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS(n_ff, n_embd, n_expert) - "ffn.experts.mlp.w1": (0, 1, 2)} # LLM_TENSOR_FFN_UP_EXPS (n_embd, n_ff, n_expert) + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor + exp_tensor_names = {"ffn.experts.mlp.v1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.w1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} experts = False for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True - data_torch = data_torch.view(n_expert, n_ff, n_embd).permute(*exp_tensor_names[exp_tensor_name]) + data_torch = data_torch.view(n_expert, n_ff, n_embd) + if permute_tensor := exp_tensor_names[exp_tensor_name] is not None: + data_torch = data_torch.permute(*permute_tensor) break old_dtype = data_torch.dtype @@ -1556,6 +1561,12 @@ def write_tensors(self): n_dims = len(data.shape) data_dtype = data.dtype + # Most of the codebase that takes in 1D tensors only handles F32 tensors + # and most of the outputs tensors are F32. 
+ if data_dtype != np.float32 and n_dims == 1: + print(f"Can not map tensor {name!r}: all 1D tensors must be F32") + sys.exit() + # if f32 desired, convert any float16 to float32 if self.ftype == 0 and data_dtype == np.float16: data = data.astype(np.float32) From f30a73bb01d31c1c33f0be0356f17ce8b489a8be Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 20:38:31 +0200 Subject: [PATCH 61/77] llama: dbrx: rename layer_out_norm to attn_out_norm --- convert-hf-to-gguf.py | 2 +- gguf-py/gguf/constants.py | 2 +- gguf-py/gguf/tensor_mapping.py | 2 +- llama.cpp | 16 ++++++++-------- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a639e39ef268b..228b68e140eec 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1535,7 +1535,7 @@ def write_tensors(self): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: experts = True data_torch = data_torch.view(n_expert, n_ff, n_embd) - if permute_tensor := exp_tensor_names[exp_tensor_name] is not None: + if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: data_torch = data_torch.permute(*permute_tensor) break diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 886256102d28c..a610acc5868fc 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -649,11 +649,11 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_OUT_NORM, MODEL_TENSOR.FFN_GATE_INP, MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, - MODEL_TENSOR.LAYER_OUT_NORM, ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 5872e2b23f038..510a273c82533 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -179,6 +179,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.output.LayerNorm", # bert "encoder.layers.{bid}.norm1", # nomic-bert "transformer.decoder_layer.{bid}.rms_norm_1", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), # Rotary embeddings @@ -309,7 +310,6 @@ class TensorNameMap: "encoder.layer.{bid}.output.LayerNorm", # bert "encoder.layers.{bid}.norm2", # nomic-bert "transformer.decoder_layer.{bid}.rms_norm_3", # Grok - "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), MODEL_TENSOR.SSM_IN: ( diff --git a/llama.cpp b/llama.cpp index 26dc24ebc826e..a9437a5b5b4ee 100644 --- a/llama.cpp +++ b/llama.cpp @@ -939,11 +939,11 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, }, }, { @@ -4692,12 +4692,13 @@ static bool llm_load_tensors( layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); layer.wo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); layer.ffn_gate_exps = 
ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS,"weight", i), {n_embd, n_ff, n_expert}); layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS,"weight", i), {n_ff, n_embd, n_expert}); layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); - layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); } } break; case LLM_ARCH_BAICHUAN: @@ -7121,6 +7122,11 @@ struct llm_build_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, cb, il); + cb(cur, "attn_out_norm", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -7128,12 +7134,6 @@ struct llm_build_context { // MoE branch cur = build_moe(cur, n_tokens, il); - // DBRX norm2 - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].layer_out_norm, NULL, - LLM_NORM, cb, il); - cb(cur, "layer_out_norm", il); - cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); From ea8b58c6cd18db632fd3b04f8046269324b91a17 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 21:10:49 +0200 Subject: [PATCH 62/77] llama: dbrx: first add the residuals and then do the norm --- llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index a9437a5b5b4ee..8db50e0fd4b93 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7122,16 +7122,16 @@ struct llm_build_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_out_norm, NULL, - LLM_NORM, cb, il); - cb(cur, "attn_out_norm", il); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network // MoE branch + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, cb, il); + cb(cur, "attn_out_norm", il); + cur = build_moe(cur, n_tokens, il); cur = ggml_add(ctx0, cur, ffn_inp); From 55943a281f3e7cca00ad2db6a751e2e04362dd5d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 8 Apr 2024 21:47:59 +0200 Subject: [PATCH 63/77] model: dbrx: convert fix mixed ffn_gate_exps and ffn_down_exps --- convert-hf-to-gguf.py | 4 ++-- gguf-py/gguf/tensor_mapping.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 228b68e140eec..49a328ff5105b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1527,9 +1527,9 @@ def write_tensors(self): # But llama.cpp moe graph works differently # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = {"ffn.experts.mlp.v1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.w1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} experts = False for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: diff --git a/gguf-py/gguf/tensor_mapping.py 
b/gguf-py/gguf/tensor_mapping.py index 510a273c82533..5d120a10f834c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -238,7 +238,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx ), # AWQ-activation gate @@ -259,7 +259,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_EXP: ( "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx ), # Feed-forward down From c7b9a2e85ee8881369aa3330a16945e7bb511b00 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 9 Apr 2024 00:58:50 +0200 Subject: [PATCH 64/77] llama: dbrx: fix ggml context of the attention outputs weight --- llama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 8db50e0fd4b93..a67149c5f1bf7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4690,7 +4690,7 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); @@ -4698,7 +4698,6 @@ static bool llm_load_tensors( layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS,"weight", i), {n_embd, n_ff, n_expert}); layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS,"weight", i), {n_ff, n_embd, n_expert}); layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); - } } break; case LLM_ARCH_BAICHUAN: From ac82aa0e63b3d1052eaff730205d622c4f8b307b Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 9 Apr 2024 01:26:57 +0200 Subject: [PATCH 65/77] gguf-py: revert spaces --- gguf-py/gguf/tensor_mapping.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 5d120a10f834c..0c9da962eff80 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -307,9 +307,9 @@ class TensorNameMap: ), MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert - "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok ), MODEL_TENSOR.SSM_IN: ( From ac75fbd8c515f8317f0d626271942fe7b595d865 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 9 Apr 2024 02:41:39 +0200 Subject: [PATCH 66/77] gguf-py: dbrx: reverse again the MOE tensors mapping: layer.ffn_up_exps -> Up-projection weights (w1) layer.ffn_gate_exps -> Gating weights (v1) layer.ffn_down_exps -> Down-projection weights (w2) --- convert-hf-to-gguf.py | 4 ++-- gguf-py/gguf/tensor_mapping.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 49a328ff5105b..228b68e140eec 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1527,9 +1527,9 @@ def write_tensors(self): # But llama.cpp moe graph works differently # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + exp_tensor_names = {"ffn.experts.mlp.v1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} experts = False for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0c9da962eff80..9d34ce5c1840e 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -238,7 +238,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_UP_EXP: ( "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx ), # AWQ-activation gate @@ -259,7 +259,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_EXP: ( "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) - "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx ), # Feed-forward down From 74529e54e58bae95659f530b340841f6494ee46b Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 10 Apr 2024 19:27:53 +0200 Subject: [PATCH 67/77] llama: dbrx: use the MOE naming convention for model type --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 2841bc2719083..656f3baf4c7fd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1718,7 +1718,6 @@ enum e_model { MODEL_40B, MODEL_65B, MODEL_70B, - MODEL_132B, MODEL_314B, MODEL_SMALL, MODEL_MEDIUM, @@ -1726,6 +1725,7 @@ enum e_model { MODEL_XL, MODEL_8x7B, MODEL_8x22B, + MODEL_16x12B, }; static const size_t kiB = 1024; @@ -3574,7 +3574,6 @@ static const char * llama_model_type_name(e_model type) { case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; - case MODEL_132B: return "132B"; case MODEL_314B: return "314B"; case MODEL_SMALL: return "0.1B"; case MODEL_MEDIUM: return "0.4B"; @@ -3582,6 +3581,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_XL: return "1.5B"; case MODEL_8x7B: return "8x7B"; case MODEL_8x22B: return "8x22B"; + case MODEL_16x12B: return "16x12B"; default: return "?B"; } } @@ -4009,7 +4009,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); switch (hparams.n_layer) { - case 40: model.type = e_model::MODEL_132B; break; + case 40: model.type = e_model::MODEL_16x12B; break; default: model.type = e_model::MODEL_UNKNOWN; } } break; From fc89feeddf7a3c28607ada5109f6870c29bfa843 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 11 Apr 2024 14:27:15 +0200 Subject: [PATCH 
68/77] model: convert-hf-to-gguf.py remove tiktoken --- convert-hf-to-gguf.py | 64 ------------------- .../requirements-convert-hf-to-gguf.txt | 1 - 2 files changed, 65 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 291a78e3485aa..d289085866662 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -393,66 +393,6 @@ def _set_vocab_llama_hf(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_tiktoken(self): - # https://github.com/openai/tiktoken - dir_model = self.dir_model - hparams = self.hparams - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size - vocab = {} - merges = [] - - # FIXME REVIEW should we extract this from QwenModel to base Model class ? - mergeable_ranks = tokenizer.encoding._mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) == 2 - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.get_added_vocab() - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in ({**vocab, **added_vocab}).items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.chat_template = tokenizer.default_chat_template - special_vocab.merges = merges - tk_endoftext = tokenizer.encoding._special_tokens["<|endoftext|>"] - - # only add special tokens when they were not already loaded from config.json - if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token("bos", tk_endoftext) - special_vocab._set_special_token("eos", tk_endoftext) - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tk_endoftext) - - special_vocab.add_to_gguf(self.gguf_writer) - @Model.register("GPTNeoXForCausalLM") class GPTNeoXModel(Model): @@ -1582,10 +1522,6 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) - def set_vocab(self): - self._set_vocab_tiktoken() - - @Model.register("MiniCPMForCausalLM") class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM diff --git a/requirements/requirements-convert-hf-to-gguf.txt b/requirements/requirements-convert-hf-to-gguf.txt index db8888caca774..6ce840d73cb73 100644 --- a/requirements/requirements-convert-hf-to-gguf.txt +++ b/requirements/requirements-convert-hf-to-gguf.txt @@ -1,4 +1,3 @@ -r ./requirements-convert.txt torch~=2.1.1 einops~=0.7.0 -tiktoken~=0.6.0 From bdc4efe17fd4c2c9b0cc795e635054122cc54204 Mon Sep 17 00:00:00 2001 From: 
Pierrick Hymbert
Date: Fri, 12 Apr 2024 21:40:47 +0200
Subject: [PATCH 69/77] Is the silu activation function applied to
 MODEL_TENSOR.FFN_GATE_EXP here? If so, we must change this to w1 for DBRX.
 Each expert in DBRX has 3 linear layers: w1, v1 and w2. For an input tensor x,
 output from the expert layer would be (silu(x.w1_t) * x.v1_t) . w2_t. The same
 math is also used in mixtral, the only difference being that DBRX uses v1
 instead of w3 in mixtral.

Co-authored-by: Megha Agarwal <16129366+megha95@users.noreply.github.com>
---
 gguf-py/gguf/tensor_mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index da900f1037214..6ef5bf3110016 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -259,7 +259,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",         # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear",   # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
         ),

         # Feed-forward down

From 542585fbea9a557fd4ec712fbfd206eb3b9e3bfd Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert
Date: Fri, 12 Apr 2024 21:40:57 +0200
Subject: [PATCH 70/77] Is the silu activation function applied to
 MODEL_TENSOR.FFN_GATE_EXP here? If so, we must change this to w1 for DBRX.
 Each expert in DBRX has 3 linear layers: w1, v1 and w2. For an input tensor x,
 output from the expert layer would be (silu(x.w1_t) * x.v1_t) . w2_t. The same
 math is also used in mixtral, the only difference being that DBRX uses v1
 instead of w3 in mixtral.

Co-authored-by: Megha Agarwal <16129366+megha95@users.noreply.github.com>
---
 gguf-py/gguf/tensor_mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 6ef5bf3110016..ec6fcbb838425 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -238,7 +238,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3",         # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
         ),

         # AWQ-activation gate

From ecbfb1b5842a033c422ec76f9c37e391bad98178 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert
Date: Fri, 12 Apr 2024 21:41:14 +0200
Subject: [PATCH 71/77] Wrong input was being fed to moe layer.
This needs to be corrected Co-authored-by: Megha Agarwal <16129366+megha95@users.noreply.github.com> --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 00a849b54ef29..000346f45a0da 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7165,7 +7165,7 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, cur, hparams, + cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].attn_out_norm, NULL, LLM_NORM, cb, il); cb(cur, "attn_out_norm", il); From 647a11b1dc41c8432d6f7ce3ee264930146052aa Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 12 Apr 2024 21:34:46 +0200 Subject: [PATCH 72/77] eval-callback: also print last n elements of each dimension --- examples/eval-callback/eval-callback.cpp | 26 ++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index f70d62128a8c8..68c217f4ee522 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -28,14 +28,27 @@ static std::string ggml_ne_string(const ggml_tensor * t) { } static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { + GGML_ASSERT(n > 0); float sum = 0; for (int64_t i3 = 0; i3 < ne[3]; i3++) { printf(" [\n"); - for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) { + for (int64_t i2 = 0; i2 < ne[2]; i2++) { + if (i2 == n && ne[2] > 2*n) { + printf(" ..., \n"); + i2 = ne[2] - n; + } printf(" [\n"); - for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) { + for (int64_t i1 = 0; i1 < ne[1]; i1++) { + if (i1 == n && ne[1] > 2*n) { + printf(" ..., \n"); + i1 = ne[1] - n; + } printf(" ["); - for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) { + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + if (i0 == n && ne[0] > 2*n) { + printf("..., "); + i0 = ne[0] - n; + } size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; float v; if (type == GGML_TYPE_F16) { @@ -51,17 +64,14 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } else { GGML_ASSERT(false); } - printf("%8.4f", v); + printf("%12.4f", v); sum += v; - if (i0 < ne[0] - 1 && i0 < n - 1) printf(", "); + if (i0 < ne[0] - 1) printf(", "); } - if (ne[0] > n) printf(", ..."); printf("],\n"); } - if (ne[1] > n) printf(" ...\n"); printf(" ],\n"); } - if (ne[2] > n) printf(" ...\n"); printf(" ]\n"); printf(" sum = %f\n", sum); } From 03bdc36e8be6b66aa55576755dafb76965085d81 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 12 Apr 2024 22:01:37 +0200 Subject: [PATCH 73/77] minor spaces --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 000346f45a0da..d3f472d9c7e51 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4728,10 +4728,10 @@ static bool llm_load_tensors( layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); - layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS,"weight", i), {n_embd, n_ff, n_expert}); - layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS,"weight", i), {n_ff, n_embd, n_expert}); - layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, 
tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); } } break; case LLM_ARCH_BAICHUAN: From 8e6758f2f4822594df9099d614200e10326bca09 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 12 Apr 2024 22:15:11 +0200 Subject: [PATCH 74/77] convert: update comment of MOE tensors mapping --- convert-hf-to-gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index d289085866662..028b9780c3697 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1470,9 +1470,9 @@ def write_tensors(self): # But llama.cpp moe graph works differently # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = {"ffn.experts.mlp.v1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.w1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} experts = False for exp_tensor_name in exp_tensor_names.keys(): if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: From f1256dc8c8d831b8c31af3ce8eed60edaaf612c2 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 00:14:50 +0200 Subject: [PATCH 75/77] llama: rename build_moe to build_moe_ffn and fix grok is using gelu instead of silu. 
Do not pass too much time on this function as it will be replaced in #6505 --- llama.cpp | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index d3f472d9c7e51..c9209574faa41 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6496,7 +6496,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = build_moe(cur, n_tokens, il); + cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il); } cur = ggml_add(ctx0, cur, ffn_inp); @@ -6528,7 +6528,8 @@ struct llm_build_context { return gf; } - ggml_tensor * build_moe(ggml_tensor * cur, int32_t n_tokens, int il) { + // REVIEW: will be replaced by https://github.com/ggerganov/llama.cpp/pull/6505 + ggml_tensor * build_moe_ffn(ggml_tensor * cur, int32_t n_tokens, llm_ffn_op_type type_op, int il) { ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] cb(logits, "ffn_moe_logits", il); @@ -6560,13 +6561,25 @@ struct llm_build_context { ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur); cb(cur_up, "ffn_moe_up", il); - ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur); - cb(cur_gate, "ffn_moe_gate", il); + ggml_tensor * gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur); + cb(gate, "ffn_moe_gate", il); - cur_gate = ggml_silu(ctx0, cur_gate); - cb(cur_gate, "ffn_moe_silu", il); + switch (type_op) { + case LLM_FFN_SILU: + { + gate = ggml_silu(ctx0, gate); + cb(gate, "ffn_moe_silu", il); + } break; + case LLM_FFN_GELU: + { + gate = ggml_gelu(ctx0, gate); + cb(gate, "ffn_moe_gelu", il); + } break; + default: + GGML_ASSERT(false); + } - cur_expert = ggml_mul(ctx0, cur_up, cur_gate); + cur_expert = ggml_mul(ctx0, cur_up, gate); cb(cur_expert, "ffn_moe_gate_par", il); cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd] @@ -7034,7 +7047,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = build_moe(cur, n_tokens, il); + cur = build_moe_ffn(cur, n_tokens, LLM_FFN_GELU, il); // Grok // if layer_out_norm is present then apply it before adding the input @@ -7170,7 +7183,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_out_norm", il); - cur = build_moe(cur, n_tokens, il); + cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il); cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); From e517585fbaac0f285d2211769ae784a43ec77d6e Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 00:17:57 +0200 Subject: [PATCH 76/77] convert-hf-to-gguf.py: fix python linter --- convert-hf-to-gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 028b9780c3697..e1ac09e024b11 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1522,6 +1522,7 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) + @Model.register("MiniCPMForCausalLM") class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM From 9f77484c913af9fb76b27d7f3e9703c45b89839c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 13 Apr 2024 11:07:30 +0200 Subject: [PATCH 77/77] minor: fix indent in llama_build_graph --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index c9209574faa41..aa4d34eca1a52 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9930,9 +9930,9 
@@ static struct ggml_cgraph * llama_build_graph( result = llm.build_command_r(); } break; case LLM_ARCH_DBRX: - { - result = llm.build_dbrx(); - } break; + { + result = llm.build_dbrx(); + } break; default: GGML_ASSERT(false); }
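
Note on the expert feed-forward math referenced in patches 69/77 and 70/77, and on the tensor
layout produced by the converter (patches 54-66, 74): each DBRX expert applies silu to the w1
branch and uses v1 as the up projection, i.e. out = (silu(x . w1^T) * (x . v1^T)) . w2, and only
w2 is permuted before export because ggml stores dimensions in the reverse of the PyTorch order.
The sketch below only illustrates that math and layout; it is not the convert-hf-to-gguf.py or
llama.cpp code, and the variable names, toy sizes and assumed per-expert weight shapes are
illustrative assumptions.

    import torch
    import torch.nn.functional as F

    n_expert, n_ff, n_embd, n_tokens = 4, 32, 16, 3

    # Assumed per-expert slices after the converter's view(n_expert, n_ff, n_embd),
    # i.e. each expert weight is taken here with shape (n_ff, n_embd):
    w1 = torch.randn(n_ff, n_embd)   # gate projection -> LLM_TENSOR_FFN_GATE_EXPS (silu applied)
    v1 = torch.randn(n_ff, n_embd)   # up projection   -> LLM_TENSOR_FFN_UP_EXPS
    w2 = torch.randn(n_ff, n_embd)   # down projection -> LLM_TENSOR_FFN_DOWN_EXPS
    x  = torch.randn(n_tokens, n_embd)

    # Per-expert output: (silu(x @ w1.T) * (x @ v1.T)) @ w2
    # (with w2 stored transposed per expert this is the ". w2_t" form written in patches 69/70)
    gate = F.silu(x @ w1.t())        # (n_tokens, n_ff)
    up   = x @ v1.t()                # (n_tokens, n_ff)
    out  = (gate * up) @ w2          # (n_tokens, n_embd)

    # Converter-side layout: w1 and v1 stay as (n_expert, n_ff, n_embd), which ggml reads as
    # ne = {n_embd, n_ff, n_expert}; w2 is permuted with (0, 2, 1) so ggml reads it as
    # ne = {n_ff, n_embd, n_expert}, matching LLM_TENSOR_FFN_DOWN_EXPS.
    w2_all = torch.randn(n_expert * n_ff, n_embd)                  # flattened layout (assumed)
    w2_3d  = w2_all.view(n_expert, n_ff, n_embd).permute(0, 2, 1)  # -> (n_expert, n_embd, n_ff)

    print(out.shape, w2_3d.shape)    # torch.Size([3, 16]) torch.Size([4, 16, 32])

This is why patch 60 permutes only ffn.experts.mlp.w2 in convert-hf-to-gguf.py while w1 and v1
are only viewed as 3D without any permutation.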