diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp
index 5e92b333aa5..686accc6089 100644
--- a/tools/quantize/ncnn2int8.cpp
+++ b/tools/quantize/ncnn2int8.cpp
@@ -134,6 +134,7 @@ class NetQuantize : public ModelWriter
     int quantize_gru();
 
     int quantize_embed();
+    int quantize_gemm();
 
     int fuse_requantize();
 };
@@ -613,6 +614,113 @@ int NetQuantize::quantize_embed()
     return 0;
 }
 
+int NetQuantize::quantize_gemm()
+{
+    for (size_t i = 0; i < layers.size(); i++)
+    {
+        if (layers[i]->type != "Gemm")
+            continue;
+
+        // Gemm - quantize weight from fp32 to int8
+        ncnn::Gemm* gemm = (ncnn::Gemm*)layers[i];
+
+        fprintf(stderr, "quantize_gemm %s\n", gemm->name.c_str());
+
+        // TODO move to ncnn2table
+
+        if (gemm->constantA)
+        {
+            if (gemm->transA == 1)
+            {
+                // transpose for easier quantization
+                ncnn::Mat A_data_transposed(gemm->constantK * gemm->constantM);
+                for (int i = 0; i < gemm->constantM; i++)
+                {
+                    float* ptr = (float*)A_data_transposed + i * gemm->constantK;
+                    for (int j = 0; j < gemm->constantK; j++)
+                    {
+                        ptr[j] = gemm->A_data[j * gemm->constantM + i];
+                    }
+                }
+                gemm->A_data = A_data_transposed;
+                gemm->transA = 0;
+            }
+
+            gemm->A_data_int8_scales.create(gemm->constantM);
+            for (int i = 0; i < gemm->constantM; i++)
+            {
+                float absmax = 0.f;
+
+                const float* ptr = (const float*)gemm->A_data + i * gemm->constantK;
+                for (int j = 0; j < gemm->constantK; j++)
+                {
+                    absmax = std::max(absmax, (float)fabs(ptr[j]));
+                }
+
+                gemm->A_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax;
+            }
+
+            ncnn::Mat A_data = gemm->A_data.reshape(gemm->constantK, gemm->constantM);
+            ncnn::Mat A_data_int8;
+
+            ncnn::Option opt_q = opt;
+            opt_q.blob_allocator = A_data.allocator;
+            opt_q.use_packing_layout = false;
+            ncnn::quantize_to_int8(A_data, A_data_int8, gemm->A_data_int8_scales, opt_q);
+            if (A_data_int8.empty())
+                return -100;
+
+            gemm->A_data = A_data_int8.reshape(gemm->constantK * gemm->constantM);
+        }
+
+        if (gemm->constantB)
+        {
+            if (gemm->transB == 0)
+            {
+                // transpose for easier quantization
+                ncnn::Mat B_data_transposed(gemm->constantK * gemm->constantN);
+                for (int i = 0; i < gemm->constantN; i++)
+                {
+                    float* ptr = (float*)B_data_transposed + i * gemm->constantK;
+                    for (int j = 0; j < gemm->constantK; j++)
+                    {
+                        ptr[j] = gemm->B_data[j * gemm->constantN + i];
+                    }
+                }
+                gemm->B_data = B_data_transposed;
+                gemm->transB = 1;
+            }
+
+            const float* ptr = gemm->B_data;
+            float absmax = 0.f;
+            for (int j = 0; j < gemm->B_data.w; j++)
+            {
+                absmax = std::max(absmax, (float)fabs(ptr[j]));
+            }
+
+            gemm->B_data_int8_scale = absmax == 0.f ? 1.f : 127 / absmax;
+
+            ncnn::Mat B_data_int8_scales(1);
+            B_data_int8_scales[0] = gemm->B_data_int8_scale;
+
+            ncnn::Mat B_data_int8;
+
+            ncnn::Option opt_q = opt;
+            opt_q.blob_allocator = gemm->B_data.allocator;
+            opt_q.use_packing_layout = false;
+            ncnn::quantize_to_int8(gemm->B_data, B_data_int8, B_data_int8_scales, opt_q);
+            if (B_data_int8.empty())
+                return -100;
+
+            gemm->B_data = B_data_int8;
+        }
+
+        gemm->int8_scale_term = 2;
+    }
+
+    return 0;
+}
+
 int NetQuantize::fuse_requantize()
 {
     const size_t layer_count = layers.size();
@@ -861,6 +969,7 @@ int main(int argc, char** argv)
     quantizer.quantize_lstm();
     quantizer.quantize_gru();
     quantizer.quantize_embed();
+    quantizer.quantize_gemm();
 
     quantizer.fuse_requantize();
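
For reference, the scheme quantize_gemm applies to a constant A is plain symmetric per-row int8 quantization: each row gets scale = 127 / absmax (1.f for all-zero rows), and each value becomes round(x * scale) clamped to [-127, 127]; constant B uses the same math with a single per-tensor scale. Below is a minimal standalone C++ sketch of that per-row scheme. It deliberately avoids the ncnn API (the quantize_rows helper and the exact rounding mode are illustrative assumptions, not what ncnn::quantize_to_int8 literally does internally):

// Sketch of per-row symmetric int8 quantization of an M x K row-major matrix,
// mirroring the scale computation in quantize_gemm above. Not ncnn code.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

static void quantize_rows(const std::vector<float>& a, int M, int K,
                          std::vector<int8_t>& q, std::vector<float>& scales)
{
    q.resize(a.size());
    scales.resize(M);
    for (int i = 0; i < M; i++)
    {
        const float* ptr = a.data() + i * K;

        // per-row absolute maximum, as in quantize_gemm
        float absmax = 0.f;
        for (int j = 0; j < K; j++)
            absmax = std::max(absmax, std::fabs(ptr[j]));

        // all-zero rows get scale 1.f so the mapping below stays well defined
        const float scale = absmax == 0.f ? 1.f : 127 / absmax;
        scales[i] = scale;

        // q = round(x * scale), clamped to the symmetric int8 range
        for (int j = 0; j < K; j++)
        {
            int v = (int)std::lround(ptr[j] * scale);
            q[i * K + j] = (int8_t)std::min(std::max(v, -127), 127);
        }
    }
}

int main()
{
    const int M = 2, K = 3;
    std::vector<float> a = {0.5f, -1.0f, 0.25f, 2.0f, 4.0f, -8.0f};
    std::vector<int8_t> q;
    std::vector<float> scales;
    quantize_rows(a, M, K, q, scales);
    for (int i = 0; i < M; i++)
        printf("row %d: scale %.3f, first q %d\n", i, scales[i], (int)q[i * K]);
    return 0;
}

Because the scale is per row of A (one output channel each), a large outlier in one row does not shrink the precision of the others; that is why the patch first transposes A when transA == 1 (and B when transB == 0), so each row to be scaled is contiguous in memory.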