From f26f9ef42c0e6d105b79c89447c2cd9d2c9c4763 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sun, 13 Aug 2023 11:41:20 +0300 Subject: [PATCH 01/12] Improve LLaMA-2 2-, 3- and 4-bit quantization * Q3_K_S: use Q5_K for 1st 2 layers of attention.wv and feed_forward.w2 * Q4_K_S: use Q6_K for 1st 2 layers of attention.wv and feed_forward.w2 * Q2_K and Q3_K_M: use Q5_K instead of Q4_K for 1st 2 layers of attention.wv and feed_forward.w2 This leads to a slight model size increase as follows: Q2_K : 2.684G vs 2.670G Q3_K_S: 2.775G vs 2.745G Q3_K_M: 3.071G vs 3.057G Q4_K_S: 3.592G vs 3.563G LLaMA-2 PPL for context 512 changes as follows: Q2_K : 6.6691 vs 6.8201 Q3_K_S: 6.2129 vs 6.2584 Q3_K_M: 6.0387 vs 6.1371 Q4_K_S: 5.9138 vs 6.0041 There are improvements for LLaMA-1 as well, but they are way smaller than the above.
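The size deltas above are easy to sanity-check. A back-of-the-envelope sketch (not part of the patch), assuming the LLaMA-2 7B shapes (n_embd = 4096, n_ff = 11008) and the nominal k-quants sizes of 3.4375, 4.5 and 5.5 bits per weight for Q3_K, Q4_K and Q5_K:

#include <stdio.h>

int main(void) {
    const double GiB = 1024.0 * 1024.0 * 1024.0;
    const long n_embd = 4096, n_ff = 11008;            /* LLaMA-2 7B shapes */
    /* attention.wv (n_embd x n_embd) plus feed_forward.w2 (n_ff x n_embd),
       for the first two layers: */
    const long nw = 2 * (n_embd * n_embd + n_ff * n_embd);
    printf("Q2_K  : +%.3fG (Q4_K -> Q5_K, +1.0000 bpw)\n", nw * 1.0000 / 8 / GiB);
    printf("Q3_K_S: +%.3fG (Q3_K -> Q5_K, +2.0625 bpw)\n", nw * 2.0625 / 8 / GiB);
    return 0;
}

This prints +0.014G and +0.030G, in line with the 2.684G vs 2.670G and 2.775G vs 2.745G figures above.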
--- llama.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index c97aaee6967e1..fa780f37ceb6c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3718,18 +3718,29 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; + } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 2) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_attention_wv < 2) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; ++i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; + } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 2) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_feed_forward_w2 < 2) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; From 77aea7214fc0ca613990517c7490a1bf17bdc426 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sun, 13 Aug 2023 13:06:07 +0300 Subject: [PATCH 02/12] Minor 4-bit quantization improvement For the same model size as the previous commit, we get PPL = 5.9069 vs 5.9138. --- k_quants.c | 11 ++++++----- llama.cpp | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/k_quants.c b/k_quants.c index 6348fce6b94d0..00ef19f7f3423 100644 --- a/k_quants.c +++ b/k_quants.c @@ -221,7 +221,8 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) { +static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, + int ntry, float alpha) { float min = x[0]; float max = x[0]; for (int i = 1; i < n; ++i) { @@ -254,7 +255,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t for (int i = 0; i < n; ++i) { sum += x[i] - scale*L[i]; } - min = sum/n; + min = alpha*min + (1 - alpha)*sum/n; if (min > 0) min = 0; iscale = 1/scale; if (!did_change) break; @@ -291,7 +292,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5); + scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5, 0.f); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -645,7 +646,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5); + scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -810,7 +811,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5); + scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; diff --git a/llama.cpp b/llama.cpp index fa780f37ceb6c..96faff378751e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3725,7 +3725,7 @@ static void llama_model_quantize_internal(const
std::string & fname_inp, const s else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 2) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_attention_wv < 2) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; @@ -3739,7 +3739,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 2) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_feed_forward_w2 < 2) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { From ec9cb753a66b1eabcbef78f21d82a4785f6f6c4a Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sun, 13 Aug 2023 18:02:19 +0300 Subject: [PATCH 03/12] Some more fine tuning --- k_quants.c | 2 +- llama.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/k_quants.c b/k_quants.c index 00ef19f7f3423..4e5563a33928f 100644 --- a/k_quants.c +++ b/k_quants.c @@ -811,7 +811,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5, 0.f); + scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; diff --git a/llama.cpp b/llama.cpp index 96faff378751e..07d8f343f32b0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3726,7 +3726,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_attention_wv < 2) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; ++i_attention_wv; @@ -3740,7 +3740,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = 
GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_feed_forward_w2 < 2) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; From 4f8dcb16533db2686c716e651156f8d90c4edd5f Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Mon, 14 Aug 2023 16:06:00 +0300 Subject: [PATCH 04/12] Adding make_qkx2_quants With it, we get PPL = 5.8828 for L2-7B Q4_K_S. --- k_quants.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- llama.cpp | 2 +- 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/k_quants.c b/k_quants.c index 4e5563a33928f..b5faefd6c6660 100644 --- a/k_quants.c +++ b/k_quants.c @@ -225,9 +225,13 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t int ntry, float alpha) { float min = x[0]; float max = x[0]; + float sum_x = 0; + float sum_x2 = 0; for (int i = 1; i < n; ++i) { if (x[i] < min) min = x[i]; if (x[i] > max) max = x[i]; + sum_x += x[i]; + sum_x2 += x[i]*x[i]; } if (max == min) { for (int i = 0; i < n; ++i) L[i] = 0; @@ -264,6 +268,76 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } +static float make_qkx2_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, + uint8_t * restrict Laux) { + float min = x[0]; + float max = x[0]; + float sum_x = 0, sum_x2 = 0; + for (int i = 1; i < n; ++i) { + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + sum_x += x[i]; + sum_x2 += x[i] * x[i]; + } + if (min > 0) min = 0; + if (max == min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -min; + return 0.f; + } + float num = sum_x2 * n - sum_x * sum_x * n / (n-1); + float iscale = nmax/(max - min); + float scale = 1/iscale; + float best_mse = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + L[i] = MAX(0, MIN(nmax, l)); + float diff = scale * L[i] + min - x[i]; + float w = x[i] * x[i]; + best_mse += w * diff * diff; + } + if (num <= 0) { + *the_min = -min; + return scale; + } + for (int is = -5; is <= 10; ++is) { + iscale = (0.1f*is + nmax)/(max - min); + int sum_l = 0, sum_l2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + Laux[i] = l; + sum_l += l; + sum_l2 += l*l; + } + int den = sum_l2 * n - sum_l * sum_l; + if (den > 0) { + float this_scale = sqrtf(num / den); + float this_min = (sum_x - this_scale * sum_l)/n; + if (this_min > 0) { + this_min = 0; + this_scale = sqrtf(sum_x2 / sum_l2); + } + float mse = 0; + for (int i = 0; i < n; ++i) { + float diff = this_scale * Laux[i] + this_min - x[i]; + float w = x[i] * x[i]; + mse += w * diff * diff; + } + if (mse < best_mse) { + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_mse = mse; + scale = this_scale; + min = this_min; + } + } + } + *the_min = -min; + return scale; +} + #if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { if (j < 4) { @@ -282,6 +356,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; + uint8_t Laux[16]; float mins[QK_K/16]; float scales[QK_K/16]; @@ -292,7 +367,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * 
restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5, 0.f); + //scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5, 0.f); + scales[j] = make_qkx2_quants(16, 3, x + 16*j, L + 16*j, &mins[j], Laux); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -638,6 +714,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; + uint8_t Laux[32]; float mins[QK_K/32]; float scales[QK_K/32]; @@ -646,7 +723,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + scales[j] = make_qkx2_quants(32, 15, x + 32*j, L + 32*j, &mins[j], Laux); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -797,6 +875,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict #if QK_K == 256 uint8_t L[QK_K]; + //uint8_t Laux[32]; float mins[QK_K/32]; float scales[QK_K/32]; #else @@ -812,6 +891,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + //scales[j] = make_qkx2_quants(32, 31, x + 32*j, L + 32*j, &mins[j], Laux); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; diff --git a/llama.cpp b/llama.cpp index 07d8f343f32b0..49e4503c02c58 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3740,7 +3740,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; + //else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; From e9f1340c20952713895028c9d76b71abbf999735 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Mon, 14 Aug 2023 17:20:02 +0300 Subject: [PATCH 05/12] Another minor improvement --- k_quants.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/k_quants.c b/k_quants.c index b5faefd6c6660..70c2f2f7ba2ba 100644 --- a/k_quants.c +++ b/k_quants.c @@ -269,7 +269,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t } static float make_qkx2_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, - uint8_t * restrict Laux) { + uint8_t * restrict Laux, bool use_mad) { float min = x[0]; float max = x[0]; float sum_x = 0, sum_x2 = 0; @@ -288,13 +288,17 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, uint8_t float num = sum_x2 * n - sum_x * sum_x * n / (n-1); 
float iscale = nmax/(max - min); float scale = 1/iscale; - float best_mse = 0; + float best_mad = 0; for (int i = 0; i < n; ++i) { int l = nearest_int(iscale*(x[i] - min)); L[i] = MAX(0, MIN(nmax, l)); float diff = scale * L[i] + min - x[i]; float w = x[i] * x[i]; - best_mse += w * diff * diff; + if (use_mad) { + best_mad += w * fabsf(diff); + } else { + best_mad += w * diff * diff; + } } if (num <= 0) { *the_min = -min; @@ -318,17 +322,21 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, uint8_t this_min = 0; this_scale = sqrtf(sum_x2 / sum_l2); } - float mse = 0; + float mad = 0; for (int i = 0; i < n; ++i) { float diff = this_scale * Laux[i] + this_min - x[i]; float w = x[i] * x[i]; - mse += w * diff * diff; + if (use_mad) { + mad += w * fabsf(diff); + } else { + mad += w * diff * diff; + } } - if (mse < best_mse) { + if (mad < best_mad) { for (int i = 0; i < n; ++i) { L[i] = Laux[i]; } - best_mse = mse; + best_mad = mad; scale = this_scale; min = this_min; } } @@ -368,7 +376,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict float max_min = 0; for (int j = 0; j < QK_K/16; ++j) { //scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5, 0.f); - scales[j] = make_qkx2_quants(16, 3, x + 16*j, L + 16*j, &mins[j], Laux); + scales[j] = make_qkx2_quants(16, 3, x + 16*j, L + 16*j, &mins[j], Laux, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -724,7 +732,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - scales[j] = make_qkx2_quants(32, 15, x + 32*j, L + 32*j, &mins[j], Laux); + scales[j] = make_qkx2_quants(32, 15, x + 32*j, L + 32*j, &mins[j], Laux, true); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -875,7 +883,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict #if QK_K == 256 uint8_t L[QK_K]; - //uint8_t Laux[32]; float mins[QK_K/32]; float scales[QK_K/32]; #else int8_t L[QK_K]; float scales[QK_K/16]; @@ -891,7 +898,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - //scales[j] = make_qkx2_quants(32, 31, x + 32*j, L + 32*j, &mins[j], Laux); float scale = scales[j]; if (scale > max_scale) { max_scale = scale;
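The switch in PATCH 05 from a pure squared-error score to an optional weighted mean-absolute-deviation is the whole change. As a standalone sketch (the helper name block_error is made up; only the formula comes from the patch), the score assigned to a candidate (scale, min) pair is:

#include <math.h>
#include <stdbool.h>
#include <stdint.h>

/* Sketch of the objective toggled by use_mad: score the reconstruction
   x[i] ~= scale*L[i] + min, weighting each residual by x[i]^2, which is
   the importance weight the patch uses at this point in the series. */
static float block_error(int n, const float * x, const uint8_t * L,
                         float scale, float min, bool use_mad) {
    float err = 0.f;
    for (int i = 0; i < n; ++i) {
        float diff = scale * L[i] + min - x[i];   /* reconstruction residual */
        float w = x[i] * x[i];                    /* importance weight */
        err += use_mad ? w * fabsf(diff) : w * diff * diff;
    }
    return err;
}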
From 1c1f985b27feff70f5c622624fa0d17620efa306 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Mon, 14 Aug 2023 20:03:14 +0300 Subject: [PATCH 06/12] Q2_K improvement Smaller model, lower perplexity. 7B: file size = 2.632G, PPL = 6.3772 vs original 2.670G PPL = 6.8201 13B: file size = 5.056G, PPL = 5.4577 vs original 5.130G PPL = 5.7178 It is mostly Q3_K except for tok_embeddings, attention.wq, attention.wk, which are Q2_K --- llama.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 49e4503c02c58..3c85945d7b315 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3719,33 +3719,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + //else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; ++i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_feed_forward_w2 < 2 ?
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; - //else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; + } else if (tensor.name.find("attention.wo.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } + else if (tensor.name.find("feed_forward.w") != std::string::npos && ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + new_type = GGML_TYPE_Q3_K; + } bool convert_incompatible_tensor = false; if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { From 404e43cc3bea2d8baacc741f917977fd7e4a5d04 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 16 Aug 2023 10:52:54 +0300 Subject: [PATCH 07/12] Iterating --- k_quants.c | 78 ++++++++++++++++++++++++++++++------------------------ llama.cpp | 9 +++++-- 2 files changed, 51 insertions(+), 36 deletions(-) diff --git a/k_quants.c b/k_quants.c index 70c2f2f7ba2ba..6c934c9f39c34 100644 --- a/k_quants.c +++ b/k_quants.c @@ -268,16 +268,19 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, - uint8_t * restrict Laux, bool use_mad) { +static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, + uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, + float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; - float sum_x = 0, sum_x2 = 0; + float sum_w = weights[0]; + float sum_x = sum_w * x[0]; for (int i = 1; i < n; ++i) { if (x[i] < min) min = x[i]; if (x[i] > max) max = x[i]; - sum_x += x[i]; - sum_x2 += x[i] * x[i]; + float w = weights[i]; + sum_w += w; + sum_x += w * x[i]; } if (min > 0) min = 0; if (max == min) { @@ -285,7 +288,6 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, uint8_t *the_min = -min; return 0.f; } - float num = sum_x2 * n - sum_x * sum_x * n / (n-1); float iscale = nmax/(max - min); float scale = 1/iscale; float best_mad = 0; @@ -293,44 +295,40 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, uint8_t int l = nearest_int(iscale*(x[i] - min)); L[i] = MAX(0, MIN(nmax, l)); float diff = scale * L[i] + min - x[i]; - float w = x[i] * x[i]; - if (use_mad) { - best_mad += w * fabsf(diff); - } else { - best_mad += w * diff * diff; - } + diff = use_mad ? 
fabsf(diff) : diff * diff; + float w = weights[i]; + best_mad += w * diff; } - if (num <= 0) { + if (nstep < 1) { *the_min = -min; return scale; } - for (int is = -5; is <= 10; ++is) { - iscale = (0.1f*is + nmax)/(max - min); - int sum_l = 0, sum_l2 = 0; + for (int is = 0; is <= nstep; ++is) { + iscale = (rmin + rdelta*is + nmax)/(max - min); + float sum_l = 0, sum_l2 = 0, sum_xl = 0; for (int i = 0; i < n; ++i) { int l = nearest_int(iscale*(x[i] - min)); l = MAX(0, MIN(nmax, l)); Laux[i] = l; - sum_l += l; - sum_l2 += l*l; + float w = weights[i]; + sum_l += w*l; + sum_l2 += w*l*l; + sum_xl += w*l*x[i]; } - int den = sum_l2 * n - sum_l * sum_l; - if (den > 0) { - float this_scale = sqrtf(num / den); - float this_min = (sum_x - this_scale * sum_l)/n; + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; + float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; if (this_min > 0) { this_min = 0; - this_scale = sqrtf(sum_x2 / sum_l2); + this_scale = sum_xl / sum_l2; } float mad = 0; for (int i = 0; i < n; ++i) { float diff = this_scale * Laux[i] + this_min - x[i]; - float w = x[i] * x[i]; - if (use_mad) { - mad += w * fabsf(diff); - } else { - mad += w * diff * diff; - } + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + mad += w * diff; } if (mad < best_mad) { for (int i = 0; i < n; ++i) { @@ -365,6 +363,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict uint8_t L[QK_K]; uint8_t Laux[16]; + float weights[16]; float mins[QK_K/16]; float scales[QK_K/16]; @@ -375,8 +374,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/16; ++j) { - //scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5, 0.f); - scales[j] = make_qkx2_quants(16, 3, x + 16*j, L + 16*j, &mins[j], Laux, false); + for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); + scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -723,6 +722,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict uint8_t L[QK_K]; uint8_t Laux[32]; + float weights[32]; float mins[QK_K/32]; float scales[QK_K/32]; @@ -731,8 +731,11 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - scales[j] = make_qkx2_quants(32, 15, x + 32*j, L + 32*j, &mins[j], Laux, true); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -885,6 +888,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict uint8_t L[QK_K]; float mins[QK_K/32]; float scales[QK_K/32]; + float weights[32]; + uint8_t Laux[32]; #else int8_t L[QK_K]; float scales[QK_K/16]; @@ -897,7 +902,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float 
max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; diff --git a/llama.cpp b/llama.cpp index 3c85945d7b315..0947918e6df1a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3750,9 +3750,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } - else if (tensor.name.find("feed_forward.w") != std::string::npos && ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { - new_type = GGML_TYPE_Q3_K; + else if (tensor.name.find("feed_forward.w") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; } + // This can be used to reduce the size of the Q5_K_S model. + // The associated PPL increase is fully in line with the size reduction + //else { + // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K; + //} bool convert_incompatible_tensor = false; if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { From 9f78d4cdf97bdccf781a8dda3c5b59920e4ff65c Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sun, 20 Aug 2023 09:02:43 +0300 Subject: [PATCH 08/12] Revert Q5_K back to make_qkx1_quants --- k_quants.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/k_quants.c b/k_quants.c index 6c934c9f39c34..f106df1358a71 100644 --- a/k_quants.c +++ b/k_quants.c @@ -731,6 +731,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { + //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); float sum_x2 = 0; for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; float av_x = sqrtf(sum_x2/32); @@ -888,8 +889,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict uint8_t L[QK_K]; float mins[QK_K/32]; float scales[QK_K/32]; - float weights[32]; - uint8_t Laux[32]; + //float weights[32]; + //uint8_t Laux[32]; #else int8_t L[QK_K]; float scales[QK_K/16]; @@ -902,12 +903,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - float sum_x2 = 0; - for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; - float av_x = sqrtf(sum_x2/32); - for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); - scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); + scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + //float sum_x2 = 0; + //for 
(int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + //float av_x = sqrtf(sum_x2/32); + //for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + //scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; From e2af308cc76ba52447009369e84553201748b6c3 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Mon, 21 Aug 2023 17:57:26 +0300 Subject: [PATCH 09/12] Better Q6_K --- k_quants.c | 56 +++++++----------------------------------------------- 1 file changed, 7 insertions(+), 49 deletions(-) diff --git a/k_quants.c b/k_quants.c index f106df1358a71..29944aa251f0e 100644 --- a/k_quants.c +++ b/k_quants.c @@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * } return 1/iscale; } + bool return_early = false; + if (rmse_type < 0) { + rmse_type = -rmse_type; + return_early = true; + } int weight_type = rmse_type%2; float sumlx = 0; float suml2 = 0; @@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * suml2 += w*l*l; } float scale = sumlx/suml2; + if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; float best = scale * sumlx; - for (int itry = 0; itry < 3; ++itry) { - iscale = 1/scale; - float slx = 0; - float sl2 = 0; - bool changed = false; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - if (l + nmax != L[i]) { changed = true; } - float w = weight_type == 1 ? x[i] * x[i] : 1.f; - slx += w*x[i]*l; - sl2 += w*l*l; - } - if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; } - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); - } - sumlx = slx; suml2 = sl2; - scale = sumlx/suml2; - best = scale * sumlx; - } - for (int itry = 0; itry < 5; ++itry) { - int n_changed = 0; - for (int i = 0; i < n; ++i) { - float w = weight_type == 1 ? 
x[i]*x[i] : 1; - int l = L[i] - nmax; - float slx = sumlx - w*x[i]*l; - if (slx > 0) { - float sl2 = suml2 - w*l*l; - int new_l = nearest_int(x[i] * sl2 / slx); - new_l = MAX(-nmax, MIN(nmax-1, new_l)); - if (new_l != l) { - slx += w*x[i]*new_l; - sl2 += w*new_l*new_l; - if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { - L[i] = nmax + new_l; sumlx = slx; suml2 = sl2; - scale = sumlx / suml2; best = scale * sumlx; - ++n_changed; - } - } - } - } - if (!n_changed) { break; } - } - if (rmse_type < 3) { - return scale; - } - for (int is = -4; is <= 4; ++is) { + for (int is = -9; is <= 9; ++is) { if (is == 0) { continue; } From b7063393d87d90fd0d39132450f05bd9e5c58e1d Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 22 Aug 2023 08:45:28 +0300 Subject: [PATCH 10/12] make_qkx2_quants is better for Q5_K after all --- k_quants.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/k_quants.c b/k_quants.c index 29944aa251f0e..82bf816976c00 100644 --- a/k_quants.c +++ b/k_quants.c @@ -847,8 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict uint8_t L[QK_K]; float mins[QK_K/32]; float scales[QK_K/32]; - //float weights[32]; - //uint8_t Laux[32]; + float weights[32]; + uint8_t Laux[32]; #else int8_t L[QK_K]; float scales[QK_K/16]; @@ -861,12 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - //float sum_x2 = 0; - //for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; - //float av_x = sqrtf(sum_x2/32); - //for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); - //scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); + //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; From 35a0b974e32df2ef2433de42f44ab01b6e5c55f0 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 22 Aug 2023 08:51:13 +0300 Subject: [PATCH 11/12] Fix after rebasing on master --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0947918e6df1a..ebec2a4f05019 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3718,7 +3718,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { + } else if (name.find("attention.wv.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_attention_wv < 2 ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -3733,7 +3733,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ++i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { + } else if (name.find("feed_forward.w2.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -3745,12 +3745,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - } else if (tensor.name.find("attention.wo.weight") != std::string::npos) { + } else if (name.find("attention.wo.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } - else if (tensor.name.find("feed_forward.w") != std::string::npos) { + else if (name.find("feed_forward.w") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; } // This can be used to reduce the size of the Q5_K_S model. From fdf73db54d26652b9722bdea9c8ce66fe4c4130a Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 22 Aug 2023 10:46:22 +0300 Subject: [PATCH 12/12] Fix for changed tensor names --- llama.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index ebec2a4f05019..c7ad1ff00c83e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3717,8 +3717,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type = GGML_TYPE_Q6_K; } } else if (name.find("attn_v.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - } else if (name.find("attention.wv.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -3727,13 +3725,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; - //else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; ++i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - } else if (name.find("feed_forward.w2.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_feed_forward_w2 < 2 ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -3744,13 +3739,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - } else if (name.find("attention.wo.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } - else if (name.find("feed_forward.w") != std::string::npos) { + else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; } // This can be used to reduce the size of the Q5_K_S model.
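A note on the machinery the series converges on: make_qkx2_quants scans nstep candidate inverse scales, (rmin + rdelta*is + nmax)/(max - min), and for each candidate refits (scale, min) in closed form by weighted least squares before scoring it with the weighted (absolute or squared) error. A minimal standalone sketch of that refit step follows; the function name fit_scale_min and the toy data are illustrative, not from the patches:

#include <stdio.h>

/* Given candidate quantization levels l[i], fit (scale, min) so that
   sum_i w[i] * (scale*l[i] + min - x[i])^2 is minimal. The 2x2 normal
   equations give exactly the sum_w/sum_l/sum_l2/sum_xl expressions used
   in make_qkx2_quants. */
static int fit_scale_min(int n, const float * x, const float * w, const int * l,
                         float * scale, float * min) {
    float sum_w = 0, sum_x = 0, sum_l = 0, sum_l2 = 0, sum_xl = 0;
    for (int i = 0; i < n; ++i) {
        sum_w  += w[i];
        sum_x  += w[i] * x[i];
        sum_l  += w[i] * l[i];
        sum_l2 += w[i] * l[i] * l[i];
        sum_xl += w[i] * l[i] * x[i];
    }
    float D = sum_w * sum_l2 - sum_l * sum_l;  /* determinant of the 2x2 system */
    if (D <= 0) return 0;                      /* degenerate: all levels equal  */
    *scale = (sum_w * sum_xl - sum_x * sum_l) / D;
    *min   = (sum_l2 * sum_x - sum_l * sum_xl) / D;
    if (*min > 0) {       /* k-quants store -min as an unsigned value, so a   */
        *min = 0;         /* positive offset is clamped to zero and the scale */
        *scale = sum_xl / sum_l2;                 /* refit without an offset  */
    }
    return 1;
}

int main(void) {
    /* Toy block: x is roughly 0.25*l - 1.0, so the fit should recover that. */
    const float x[8] = { -1.02f, -0.76f, -0.49f, -0.26f, 0.01f, 0.24f, 0.51f, 0.74f };
    const float w[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };  /* uniform weights for the demo */
    const int   l[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    float scale, min;
    if (fit_scale_min(8, x, w, l, &scale, &min)) {
        printf("scale = %.4f, min = %.4f\n", scale, min);  /* ~0.2515, ~-1.0092 */
    }
    return 0;
}

With the importance weights of PATCH 07 (|x| for Q2_K, av_x + |x| for Q4_K and Q5_K), this is the fit that remains in the final state of the series for Q2_K, Q4_K and Q5_K, while Q6_K keeps the widened make_qx_quants scale search from PATCH 09.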