embed int8 quantization #5667

Merged 3 commits on Sep 2, 2024
Changes from all commits
2 changes: 2 additions & 0 deletions .ci/pnnx.yml
@@ -4,12 +4,14 @@ on:
branches: [master]
paths:
- '.ci/pnnx.yml'
- 'src/layer/*'
- 'tools/pnnx/**'
- '!tools/pnnx/README.md'
mr:
target-branches: [master]
paths:
- '.ci/pnnx.yml'
- 'src/layer/*'
- 'tools/pnnx/**'
- '!tools/pnnx/README.md'
concurrency:
2 changes: 2 additions & 0 deletions docs/developer-guide/operators.md
@@ -837,11 +837,13 @@ y = embedding(x)
| 1 | input_dim | int | 0 | |
| 2 | bias_term | int | 0 | |
| 3 | weight_data_size | int | 0 | |
| 18 | int8_scale_term | int | 0 | |

| weight | type | shape |
| ------------- | ----- | --------------------- |
| weight_data | float | [weight_data_size] |
| bias_term | float | [num_output] |
| weight_data_int8_scales | float | [1] |

# Exp
```
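The new int8_scale_term parameter and weight_data_int8_scales weight added to the Embed table above describe a per-tensor int8 path: when int8_scale_term is set, weight_data is stored as int8 and weight_data_int8_scales holds a single scale used to recover the original values. A minimal sketch of that relation (the function name here is illustrative, not part of the ncnn API):

```cpp
#include <stdint.h>

// Per-tensor dequantization implied by the table above: the stored int8
// weight q and the single scale in weight_data_int8_scales recover the
// original value as w ~= q / scale.
static inline float dequantize_embed_weight(int8_t q, float scale)
{
    return (float)q / scale;
}
```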
88 changes: 79 additions & 9 deletions src/layer/embed.cpp
@@ -30,6 +30,7 @@ int Embed::load_param(const ParamDict& pd)
input_dim = pd.get(1, 0);
bias_term = pd.get(2, 0);
weight_data_size = pd.get(3, 0);
int8_scale_term = pd.get(18, 0);

return 0;
}
@@ -47,18 +48,23 @@ int Embed::load_model(const ModelBin& mb)
return -100;
}

#if NCNN_INT8
if (int8_scale_term)
{
weight_data_int8_scale = mb.load(1, 1)[0];
}
#endif // NCNN_INT8

return 0;
}

-int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
{
-int words = static_cast<int>(bottom_blob.total());
const int num_output = top_blob.w;
const int words = top_blob.h;

-top_blob.create(num_output, words, 4u, opt.blob_allocator);
-if (top_blob.empty())
-return -100;
const float* bias_ptr = bias_data;

-// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < words; q++)
{
@@ -73,15 +79,79 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const

const float* em = (const float*)weight_data + num_output * word_index;

-memcpy(outptr, em, num_output * sizeof(float));
if (bias_ptr)
{
for (int p = 0; p < num_output; p++)
{
outptr[p] = em[p] + bias_ptr[p];
}
}
else
{
memcpy(outptr, em, num_output * sizeof(float));
}
}
}

#if NCNN_INT8
static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
{
const int num_output = top_blob.w;
const int words = top_blob.h;

const float* bias_ptr = bias_data;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < words; q++)
{
float* outptr = top_blob.row(q);

int word_index = ((const int*)bottom_blob)[q];

-if (bias_term)
if (word_index < 0)
word_index = 0;
if (word_index >= input_dim)
word_index = input_dim - 1;

const float descale_em = 1.f / weight_data_int8_scale;

const signed char* em = (const signed char*)weight_data + num_output * word_index;

if (bias_ptr)
{
for (int p = 0; p < num_output; p++)
{
-outptr[p] += bias_data[p];
outptr[p] = em[p] * descale_em + bias_ptr[p];
}
}
else
{
for (int p = 0; p < num_output; p++)
{
outptr[p] = em[p] * descale_em;
}
}
}
}
#endif // NCNN_INT8

int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int words = static_cast<int>(bottom_blob.total());

top_blob.create(num_output, words, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

#if NCNN_INT8
if (int8_scale_term)
{
embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt);
}
else
#endif // NCNN_INT8
{
embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt);
}

return 0;
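With this split, Embed::forward only allocates the output and dispatches to the fp32 or int8 helper; the int8 helper rescales each stored signed char by 1 / weight_data_int8_scale while copying the row. Below is a self-contained sketch of that lookup, stripped of ncnn types (all names are illustrative; the real implementation is embed_int8 above):

```cpp
#include <stddef.h>
#include <stdint.h>

// One int8 embedding lookup: table is an [input_dim x num_output] int8 matrix,
// scale is the per-tensor quantization scale, bias may be null.
void embed_lookup_int8(const int8_t* table, float scale, const float* bias,
                       int input_dim, int num_output, int word_index, float* out)
{
    // clamp out-of-range indices, as the ncnn layer does
    if (word_index < 0) word_index = 0;
    if (word_index >= input_dim) word_index = input_dim - 1;

    const float descale = 1.f / scale;
    const int8_t* row = table + (size_t)num_output * word_index;

    for (int p = 0; p < num_output; p++)
    {
        out[p] = row[p] * descale + (bias ? bias[p] : 0.f);
    }
}
```

The only difference from the fp32 path is the per-element multiply by descale; the lookup itself stays a plain row gather, so the 4x smaller weight table is the main win.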
6 changes: 6 additions & 0 deletions src/layer/embed.h
@@ -38,9 +38,15 @@ class Embed : public Layer

int weight_data_size;

int int8_scale_term;

// model
Mat weight_data;
Mat bias_data;

#if NCNN_INT8
float weight_data_int8_scale;
#endif
};

} // namespace ncnn
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
@@ -101,6 +101,7 @@ ncnn_add_layer_test(Dropout)
ncnn_add_layer_test(Einsum)
ncnn_add_layer_test(Eltwise)
ncnn_add_layer_test(ELU)
ncnn_add_layer_test(Embed)
ncnn_add_layer_test(Erf)
ncnn_add_layer_test(ExpandDims)
ncnn_add_layer_test(Flatten)
108 changes: 108 additions & 0 deletions tests/test_embed.cpp
@@ -0,0 +1,108 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "testutil.h"

static int test_embed(int words, int num_output, int input_dim, int bias)
{
ncnn::ParamDict pd;
pd.set(0, num_output);
pd.set(1, input_dim);
pd.set(2, bias);
pd.set(3, num_output * input_dim);

std::vector<ncnn::Mat> weights(bias ? 2 : 1);
weights[0] = RandomMat(num_output * input_dim);
if (bias)
weights[1] = RandomMat(num_output);

ncnn::Mat a(words);
RandomizeInt(a, 0, input_dim);

int ret = test_layer("Embed", pd, weights, a);
if (ret != 0)
{
fprintf(stderr, "test_embed failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias);
}

return ret;
}

static int test_embed_0()
{
return 0
|| test_embed(128, 128, 128, 0)
|| test_embed(128, 128, 128, 1)
|| test_embed(127, 127, 127, 0)
|| test_embed(127, 127, 127, 1)
|| test_embed(124, 124, 124, 0)
|| test_embed(124, 124, 124, 1);
}

#if NCNN_INT8
static int test_embed_int8(int words, int num_output, int input_dim, int bias)
{
ncnn::ParamDict pd;
pd.set(0, num_output);
pd.set(1, input_dim);
pd.set(2, bias);
pd.set(3, num_output * input_dim);
pd.set(18, 2);

std::vector<ncnn::Mat> weights(bias ? 3 : 2);
weights[0] = RandomS8Mat(num_output * input_dim);
if (bias)
{
weights[1] = RandomMat(num_output);
weights[2] = RandomMat(1, 100.f, 200.f);
}
else
{
weights[1] = RandomMat(1, 100.f, 200.f);
}

ncnn::Mat a(words);
RandomizeInt(a, 0, input_dim);

int ret = test_layer("Embed", pd, weights, a);
if (ret != 0)
{
fprintf(stderr, "test_embed_int8 failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias);
}

return ret;
}

static int test_embed_1()
{
return 0
|| test_embed_int8(128, 128, 128, 0)
|| test_embed_int8(128, 128, 128, 1)
|| test_embed_int8(127, 127, 127, 0)
|| test_embed_int8(127, 127, 127, 1)
|| test_embed_int8(124, 124, 124, 0)
|| test_embed_int8(124, 124, 124, 1);
}
#endif // NCNN_INT8

int main()
{
SRAND(7767517);

#if NCNN_INT8
return test_embed_0() || test_embed_1();
#else
return test_embed_0();
#endif
}
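Note how the int8 test mirrors load_model's read order: weights[0] is the int8 weight table, the optional bias follows, and the last Mat is the one-element scale. With signed 8-bit weights in [-127, 127] and a scale drawn from [100, 200], the dequantized values land roughly in [-1.27, 1.27], i.e. about the same magnitude as the random fp32 weights used in test_embed_0.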
11 changes: 11 additions & 0 deletions tools/modelwriter.h
@@ -1676,9 +1676,20 @@ int ModelWriter::save(const char* parampath, const char* binpath)
fprintf_param_value(" 1=%d", input_dim)
fprintf_param_value(" 2=%d", bias_term)
fprintf_param_value(" 3=%d", weight_data_size)
fprintf_param_value(" 18=%d", int8_scale_term)

fwrite_weight_tag_data(op->weight_data, bp);
fwrite_weight_data(op->bias_data, bp);

#if NCNN_INT8
// write int8_scale data
if (op->int8_scale_term)
{
ncnn::Mat weight_data_int8_scales(1);
weight_data_int8_scales[0] = op->weight_data_int8_scale;
fwrite_weight_data(weight_data_int8_scales, bp, 90, 100);
}
#endif // NCNN_INT8
}
else if (layer->type == "Exp")
{
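The write order here mirrors the read order in Embed::load_model: weight_data (now int8) is written first via fwrite_weight_tag_data, then bias_data, and finally the one-element scale mat when int8_scale_term is set. A quantized Embed layer in the written .param file would then carry the extra 18=2 flag, for example (values illustrative only):

Embed embed_0 1 1 input output 0=128 1=10000 3=1280000 18=2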
52 changes: 52 additions & 0 deletions tools/quantize/ncnn2int8.cpp
@@ -133,6 +133,8 @@ class NetQuantize : public ModelWriter
int quantize_lstm();
int quantize_gru();

int quantize_embed();

int fuse_requantize();
};

@@ -562,6 +564,55 @@ int NetQuantize::quantize_gru()
return 0;
}

int NetQuantize::quantize_embed()
{
for (size_t i = 0; i < layers.size(); i++)
{
if (layers[i]->type != "Embed")
continue;

// Embed - quantize weight from fp32 to int8
ncnn::Embed* embed = (ncnn::Embed*)layers[i];

fprintf(stderr, "quantize_embed %s\n", embed->name.c_str());

// TODO move to ncnn2table

const int num_output = embed->num_output;
const int input_dim = embed->input_dim;

ncnn::Mat weight_data_int8_scales(1);
{
const float* ptr = embed->weight_data;
float absmax = 0.f;
for (int i = 0; i < embed->weight_data.w; i++)
{
absmax = std::max(absmax, (float)fabs(ptr[i]));
}

weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax;
}

{
ncnn::Mat weight_data_int8;

ncnn::Option opt_q = opt;
opt_q.blob_allocator = embed->weight_data.allocator;
opt_q.use_packing_layout = false;
ncnn::quantize_to_int8(embed->weight_data, weight_data_int8, weight_data_int8_scales, opt_q);
if (weight_data_int8.empty())
return -100;

embed->weight_data = weight_data_int8;
}

embed->int8_scale_term = 2;
embed->weight_data_int8_scale = weight_data_int8_scales[0];
}

return 0;
}

int NetQuantize::fuse_requantize()
{
const size_t layer_count = layers.size();
@@ -809,6 +860,7 @@ int main(int argc, char** argv)
quantizer.quantize_rnn();
quantizer.quantize_lstm();
quantizer.quantize_gru();
quantizer.quantize_embed();

quantizer.fuse_requantize();

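quantize_embed derives a single per-tensor scale of 127 / absmax over the whole embedding table (falling back to 1.0 when the table is all zeros), quantizes the weight with that scale, and stores the scale for the runtime descale. A small worked example of the round trip (standalone, not using ncnn; the rounding and saturation mirror what quantize_to_int8 conceptually does):

```cpp
#include <math.h>
#include <stdio.h>

int main()
{
    // Suppose the largest-magnitude embedding weight is 0.5.
    float absmax = 0.5f;
    float scale = 127.f / absmax;                   // 254
    float w = 0.25f;                                // some fp32 weight
    signed char q = (signed char)roundf(w * scale); // 64; real code also saturates to [-127, 127]
    float back = q / scale;                         // ~0.2520, close to the original 0.25
    printf("scale=%f q=%d dequant=%f\n", scale, q, back);
    return 0;
}
```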