New quant strategy / FTYPE IQ3_XL 4bpw
An intermediary mixed FTYPE sitting between IQ3_M and IQ4_XS, at 4bpw.

Transposed faithfully from @ikawrakow's new FTYPE IQ3_KL on ik_llama.cpp, so it can be trusted, except for attn_k.weight, which I chose to bump to IQ4_XS when GQA is present: in that case it makes no sense to quantize the key heads smaller than attn_output.weight or half of the FFN tensors.
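For illustration, a minimal self-contained sketch of that attn_k.weight rule (the helper name iq3_xl_attn_k_type and the qtype enum are stand-ins of mine for this example, not llama.cpp symbols; the real logic lives in llama_tensor_get_type in the src/llama.cpp hunk below):

    #include <string>

    enum class qtype { IQ3_S, IQ4_XS };

    // n_gqa = n_head / n_head_kv; a value >= 2 means grouped-query attention.
    static qtype iq3_xl_attn_k_type(const std::string & tensor_name, int n_gqa) {
        if (tensor_name.find("attn_k.weight") != std::string::npos && n_gqa >= 2) {
            // under GQA the K heads are small relative to attn_output.weight and
            // the FFN tensors, so they get the 4.25 bpw IQ4_XS type
            return qtype::IQ4_XS;
        }
        return qtype::IQ3_S; // otherwise keep the IQ3_XL default type
    }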

The XL suffix is chosen to account for the possible emergence of an IQ3_M/L GGML_TYPE close to 4bpw, with a related FTYPE. In that case, the mixed FTYPE proposed here could be replaced.

This FTYPE answers the demand of many users, myself included, for an intermediary between IQ3_M and IQ4_XS: the ~0.6bpw gap between them (3.66 vs 4.25bpw) prevents users from getting the best possible quality in many cases (70b on 36GiB of VRAM, 123b on 64GiB of VRAM).
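To make those VRAM figures concrete, a rough back-of-envelope sketch using only the nominal bpw values quoted in this diff (it ignores the token_embd/output tensors, the KV cache and runtime buffers, so real GGUF sizes land a bit higher):

    #include <cstdio>

    int main() {
        const double GiB      = 1024.0 * 1024.0 * 1024.0;
        const double params[] = { 70e9, 123e9 };      // the 70b / 123b cases above
        const double bpw[]    = { 3.66, 4.00, 4.25 }; // IQ3_M, IQ3_XL target, IQ4_XS
        for (double p : params) {
            for (double b : bpw) {
                // weights only: parameter count * bits-per-weight / 8 bytes
                std::printf("%4.0fb @ %.2f bpw ~= %5.1f GiB\n", p / 1e9, b, p * b / 8.0 / GiB);
            }
        }
        return 0;
    }

At 4.25bpw a 70b is already ~34.6GiB of weights alone (tight on 36GiB) and a 123b ~60.9GiB (tight on 64GiB); at 4bpw they drop to ~32.6GiB and ~57.3GiB, which leaves a couple of GiB for context.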

In a later PR that I can provide, and in order not to multiply the FTYPEs, IQ2_M could be eliminated and IQ2_S promoted to replace it, so that the nomenclature makes sense again (currently, the IQ2_S FTYPE is really an IQ2_XS+, while the IQ2_M FTYPE uses the IQ2_S GGML_TYPE), with IQ2_XS getting a small boost to compensate for the disappearance of the old IQ2_S FTYPE, which is, to be honest, first in line to be sacrificed.
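For reference, a tiny standalone snippet showing the current FTYPE -> default GGML_TYPE mapping behind that naming mismatch (the values mirror the llama_model_quantize_internal switch further down in this diff; the strings here are purely illustrative, not llama.cpp symbols):

    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        // FTYPE -> default GGML_TYPE for the IQ2 family, as of this commit
        const std::vector<std::pair<std::string, std::string>> iq2_defaults = {
            { "IQ2_XS", "GGML_TYPE_IQ2_XS" },
            { "IQ2_S",  "GGML_TYPE_IQ2_XS" }, // the "S" FTYPE is really an IQ2_XS+
            { "IQ2_M",  "GGML_TYPE_IQ2_S"  }, // the "M" FTYPE uses the IQ2_S GGML_TYPE
        };
        for (const auto & e : iq2_defaults) {
            std::printf("FTYPE %-6s -> %s\n", e.first.c_str(), e.second.c_str());
        }
        return 0;
    }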

Note: I allow myself to PR this because I have experimented quite extensively with the quant strategies, and have already revamped LCCP's (llama.cpp's) quant strategies for my own use, some of which has already been PRed on this repo as a demo.
Nexesenex committed Oct 12, 2024
1 parent 9677640 commit 2b2ab6f
Showing 4 changed files with 25 additions and 7 deletions.
1 change: 1 addition & 0 deletions examples/quantize/quantize.cpp
@@ -38,6 +38,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
 { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
 { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
+{ "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 4 bpw non-linear quantization mix",},
 { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
 { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
 { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
@@ -1443,6 +1443,7 @@ class LlamaFileType(IntEnum):
 MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
 MOSTLY_TQ1_0 = 36 # except 1d tensors
 MOSTLY_TQ2_0 = 37 # except 1d tensors
+MOSTLY_IQ3_XL = 38 # except 1d tensors
 
 GUESSED = 1024 # not specified in the model file
 
1 change: 1 addition & 0 deletions include/llama.h
@@ -175,6 +175,7 @@ extern "C" {
 LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
 LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_IQ3_XL = 38, // except 1d tensors
 
 LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };
29 changes: 22 additions & 7 deletions src/llama.cpp
@@ -5285,6 +5285,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
 case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S and IQ4_XS mix - 4 bpw";
 case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
 case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
 case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
@@ -18011,8 +18012,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
 new_type = GGML_TYPE_Q4_K;
 }
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-new_type = GGML_TYPE_Q4_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
 new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
@@ -18042,11 +18043,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 // TODO: explore better strategies
 new_type = GGML_TYPE_Q8_0;
 }
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+new_type = GGML_TYPE_IQ2_S;
+}
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
 new_type = GGML_TYPE_IQ3_XXS;
 }
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-new_type = GGML_TYPE_IQ2_S;
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
 }
 } else if (name.find("attn_q.weight") != std::string::npos) {
 if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
@@ -18074,6 +18078,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
 new_type = GGML_TYPE_Q4_K;
 }
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+new_type = use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+}
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
 new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
 }
@@ -18115,6 +18122,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ) new_type = GGML_TYPE_IQ4_XS;
 }
 } else {
 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
@@ -18133,6 +18141,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
 new_type = GGML_TYPE_IQ3_XXS;
 }
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && use_more_bits(i_layer, n_layer)) {
+new_type = GGML_TYPE_IQ4_XS;
+}
 ++qs.i_ffn_gate;
 }
 else if (name.find("ffn_up") != std::string::npos) {
@@ -18141,6 +18152,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
 new_type = GGML_TYPE_IQ3_XXS;
 }
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && use_more_bits(i_layer, n_layer)) {
+new_type = GGML_TYPE_IQ4_XS;
+}
 ++qs.i_ffn_up;
 }
 
@@ -18282,16 +18296,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break;
 case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
 case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
-case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
+case LLAMA_FTYPE_MOSTLY_IQ2_XS:
 case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
 case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
 case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
 case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
 case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
 case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
 case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
-case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
-case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
+case LLAMA_FTYPE_MOSTLY_IQ3_S:
+case LLAMA_FTYPE_MOSTLY_IQ3_M:
+case LLAMA_FTYPE_MOSTLY_IQ3_XL: default_type = GGML_TYPE_IQ3_S; break;
 case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
 case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
 case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
