diff --git a/docs/developer-guide/operation-param-weight-table.md b/docs/developer-guide/operation-param-weight-table.md
index aa5c99adf5e6..fb77d5c250b7 100644
--- a/docs/developer-guide/operation-param-weight-table.md
+++ b/docs/developer-guide/operation-param-weight-table.md
@@ -151,6 +151,9 @@
 ||2|width_scale|1.f|
 ||3|output_height|0|
 ||4|output_width|0|
+|LinearInt8|0|in_dim|0|scale weight|
+||1|out_dim|0|
+||2|group_size|1|
 |Log|0|base|-1.f|
 ||1|scale|1.f|
 ||2|shift|0.f|
diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md
index 56e7516a36a0..6db563b429f1 100644
--- a/docs/developer-guide/operators.md
+++ b/docs/developer-guide/operators.md
@@ -45,6 +45,7 @@
 * [InstanceNorm](#instancenorm)
 * [Interp](#interp)
 * [LayerNorm](#layernorm)
+* [LinearInt8](#linearint8)
 * [Log](#log)
 * [LRN](#lrn)
 * [LSTM](#lstm)
@@ -1104,6 +1105,25 @@ y = x * gamma + beta by elementwise
 | gamma_data | float | [affine_size] |
 | beta_data | float | [affine_size] |
 
+# LinearInt8
+```
+y = x (W S)^T
+```
+
+W is the int8 weight matrix; S broadcasts one dequantization scale over each run of group_size weights.
+
+| param id | name       | type | default | description |
+| -------- | ---------- | ---- | ------- | ----------- |
+| 0        | in_dim     | int  | 0       | input feature size |
+| 1        | out_dim    | int  | 0       | output feature size |
+| 2        | group_size | int  | 1       | weights per dequantization scale |
+
+| weight | type  | shape |
+| ------ | ----- | ----- |
+| scale  | float | [in_dim * out_dim / group_size] |
+| weight | int8  | [in_dim, out_dim] |
+
 # Log
 ```
 if base == -1 y = log(shift + x * scale)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4a4ea24e6365..4dd6812f5c06 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -165,6 +165,7 @@ ncnn_add_layer(CopyTo)
 ncnn_add_layer(Erf)
 ncnn_add_layer(Diag)
 ncnn_add_layer(CELU)
+ncnn_add_layer(LinearInt8)
 
 if(NCNN_VULKAN)
     ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)
diff --git a/src/layer/arm/linearint8_arm.cpp b/src/layer/arm/linearint8_arm.cpp
new file mode 100644
index 000000000000..b0cff23423b7
--- /dev/null
+++ b/src/layer/arm/linearint8_arm.cpp
@@ -0,0 +1,91 @@
+#include "linearint8_arm.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+int LinearInt8_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    if (bottom_blob.dims != 2 || bottom_blob.w != in_dim)
+        return -1;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    size_t elemsize = bottom_blob.elemsize;
+
+    top_blob.create(out_dim, h, elemsize, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    const signed char* wt = weight;
+
+#if (__ARM_NEON && __aarch64__)
+    if (w % group_size == 0 && group_size % 8 == 0)
+    {
+        for (int j = 0; j < h; j++)
+        {
+            const float* m = bottom_blob.row(j);
+            float* out = top_blob.row(j);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p = 0; p < out_dim; p++)
+            {
+                int base = w * p;
+                float32x4_t acc_p0 = vdupq_n_f32(0.f);
+                float32x4_t acc_p1 = vdupq_n_f32(0.f);
+                for (int k = 0; k < w; k += group_size)
+                {
+                    int index = base + k;
+                    float32x4_t sc_p = vdupq_n_f32(scales[index / group_size]);
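+                    // dequantize eight int8 weights per step: widen s8 -> s16 -> s32,
+                    // convert to f32 and apply the shared per-group scale, then
+                    // multiply-accumulate against eight input lanes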
+                    for (int i = 0, ind = index; i < group_size; i += 8, ind += 8)
+                    {
+                        int8x8_t i8x8 = vld1_s8(wt + ind);
+                        int16x8_t i16x8 = vmovl_s8(i8x8);
+                        int32x4_t i32_0 = vmovl_s16(vget_low_s16(i16x8));
+                        int32x4_t i32_1 = vmovl_s16(vget_high_s16(i16x8));
+                        float32x4_t wt_p0 = vcvtq_f32_s32(i32_0);
+                        float32x4_t wt_p1 = vcvtq_f32_s32(i32_1);
+                        float32x4_t m_p0 = vld1q_f32(m + k + i);
+                        float32x4_t m_p1 = vld1q_f32(m + k + i + 4);
+                        float32x4_t wt_s0 = vmulq_f32(wt_p0, sc_p);
+                        float32x4_t wt_s1 = vmulq_f32(wt_p1, sc_p);
+                        acc_p0 = vmlaq_f32(acc_p0, m_p0, wt_s0);
+                        acc_p1 = vmlaq_f32(acc_p1, m_p1, wt_s1);
+                    }
+                }
+                out[p] = vaddvq_f32(acc_p0) + vaddvq_f32(acc_p1);
+            }
+        }
+        return 0;
+    }
+#endif
+
+    for (int j = 0; j < h; j++)
+    {
+        const float* m = bottom_blob.row(j);
+        float* out = top_blob.row(j);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < out_dim; p++)
+        {
+            int base = w * p;
+            float acc = 0.0f;
+            for (int i = 0, index = base, scales_index = index / group_size; i < w; i++, index++)
+            {
+                acc += m[i] * wt[index] * scales[scales_index];
+                if (index % group_size == group_size - 1)
+                    scales_index++;
+            }
+            out[p] = acc;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/arm/linearint8_arm.h b/src/layer/arm/linearint8_arm.h
new file mode 100644
index 000000000000..76b2255f4fa5
--- /dev/null
+++ b/src/layer/arm/linearint8_arm.h
@@ -0,0 +1,31 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_LINEARINT8_ARM_H
+#define LAYER_LINEARINT8_ARM_H
+
+#include "net.h"
+#include "linearint8.h"
+
+namespace ncnn {
+
+class LinearInt8_arm : virtual public LinearInt8
+{
+public:
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_LINEARINT8_ARM_H
diff --git a/src/layer/linearint8.cpp b/src/layer/linearint8.cpp
new file mode 100644
index 000000000000..6c3e2d7da107
--- /dev/null
+++ b/src/layer/linearint8.cpp
@@ -0,0 +1,83 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "linearint8.h"
+
+namespace ncnn {
+
+LinearInt8::LinearInt8()
+{
+    one_blob_only = true;
+    support_inplace = false;
+}
+
+int LinearInt8::load_param(const ParamDict& pd)
+{
+    in_dim = pd.get(0, 0);
+    out_dim = pd.get(1, 0);
+    group_size = pd.get(2, 1);
+    if (group_size < 1 || in_dim * out_dim % group_size)
+        return -1;
+    return 0;
+}
+
+int LinearInt8::load_model(const ModelBin& mb)
+{
+    scales = mb.load(in_dim * out_dim / group_size, 1);
+    weight = mb.load(in_dim * out_dim, 0);
+    if (weight.elemsize != 1)
+        return -1;
+    return 0;
+}
+
+int LinearInt8::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    if (bottom_blob.dims != 2 || bottom_blob.w != in_dim)
+        return -1;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    size_t elemsize = bottom_blob.elemsize;
+
+    top_blob.create(out_dim, h, elemsize, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    const signed char* wt = weight;
+
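+    // reference path: out[p] = sum_i m[i] * wt[p * in_dim + i] * scales[(p * in_dim + i) / group_size]
+    // (the int8 weight is dequantized on the fly, one scale per group_size weights)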
+    for (int j = 0; j < h; j++)
+    {
+        const float* m = bottom_blob.row(j);
+        float* out = top_blob.row(j);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < out_dim; p++)
+        {
+            int base = w * p;
+            float acc = 0.0f;
+            for (int i = 0; i < w; i++)
+            {
+                int index = base + i;
+                acc += m[i] * wt[index] * scales[index / group_size];
+            }
+            out[p] = acc;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/linearint8.h b/src/layer/linearint8.h
new file mode 100644
index 000000000000..981177d1c8b8
--- /dev/null
+++ b/src/layer/linearint8.h
@@ -0,0 +1,42 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_LINEARINT8_H
+#define LAYER_LINEARINT8_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class LinearInt8 : public Layer
+{
+public:
+    LinearInt8();
+
+    virtual int load_param(const ParamDict& pd);
+
+    virtual int load_model(const ModelBin& mb);
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+    int in_dim;
+    int out_dim;
+    int group_size;
+    Mat weight;
+    Mat scales;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_LINEARINT8_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 21de08c6ff35..bdf0dee43d1a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -114,6 +114,7 @@ ncnn_add_layer_test(InnerProduct)
 ncnn_add_layer_test(InstanceNorm)
 ncnn_add_layer_test(Interp)
 ncnn_add_layer_test(LayerNorm)
+ncnn_add_layer_test(LinearInt8)
 ncnn_add_layer_test(LRN)
 ncnn_add_layer_test(LSTM)
 ncnn_add_layer_test(MatMul)
diff --git a/tests/test_linearint8.cpp b/tests/test_linearint8.cpp
new file mode 100644
index 000000000000..a4cea2a027b7
--- /dev/null
+++ b/tests/test_linearint8.cpp
@@ -0,0 +1,75 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer/linearint8.h"
+#include "testutil.h"
+
+static int test_linearint8(const ncnn::Mat& a, int in_dim, int out_dim, int group_size)
+{
+    if (in_dim * out_dim % group_size)
+    {
+        fprintf(stderr, "malformed test case: in_dim=%d out_dim=%d group_size=%d\n", in_dim, out_dim, group_size);
+        return -1;
+    }
+    if (a.w != in_dim)
+    {
+        fprintf(stderr, "malformed test case: a.w=%d does not match in_dim=%d\n", a.w, in_dim);
+        return -1;
+    }
+    ncnn::ParamDict pd;
+    pd.set(0, in_dim);
+    pd.set(1, out_dim);
+    pd.set(2, group_size);
+
+    std::vector<ncnn::Mat> weights(2);
+    weights[0] = RandomMat(in_dim * out_dim / group_size);
+    weights[1] = RandomS8Mat(in_dim * out_dim);
+
+    int ret = test_layer("LinearInt8", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_linearint8 failed a.dims=%d a=(%d %d) in_dim=%d out_dim=%d group_size=%d\n", a.dims, a.w, a.h, in_dim, out_dim, group_size);
+    }
+
+    return ret;
+}
+
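+// test_linearint8_0 exercises the portable scalar path (10 % 4 != 0 and 4 % 8 != 0);
+// the w=16, group_size=16 case below also hits the aarch64 NEON path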
+static int test_linearint8_0()
+{
+    ncnn::Mat a = RandomMat(10, 1);
+
+    return 0
+           || test_linearint8(a, 10, 6, 4)
+           || test_linearint8(a, 10, 8, 4)
+           || test_linearint8(a, 10, 10, 4);
+}
+
+static int test_linearint8_1()
+{
+    ncnn::Mat a = RandomMat(16, 1);
+
+    return 0
+           || test_linearint8(a, 16, 6, 16);
+}
+
+int main()
+{
+    SRAND(7767517);
+
+    return 0
+           || test_linearint8_0()
+           || test_linearint8_1();
+}
diff --git a/tests/testutil.h b/tests/testutil.h
index b879fa527fbe..a826ca78955f 100644
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -1458,6 +1458,11 @@ int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std:
     weights_fp16.resize(weights.size());
     for (size_t j = 0; j < weights.size(); j++)
     {
+        if (weights[j].elemsize != 4)
+        {
+            weights_fp16[j] = weights[j].clone();
+            continue;
+        }
         ncnn::Mat tmp;
         ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt);
         ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt);
@@ -1469,6 +1474,11 @@ int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std:
     weights_fp16.resize(weights.size());
     for (size_t j = 0; j < weights.size(); j++)
     {
+        if (weights[j].elemsize != 4)
+        {
+            weights_fp16[j] = weights[j].clone();
+            continue;
+        }
         ncnn::Mat tmp;
         ncnn::cast_float32_to_float16(weights[j], tmp, opt);
         ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt);
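
For reference, a minimal standalone sketch of driving the new layer outside the test harness, mirroring what `tests/testutil.h` does internally. The layer name `LinearInt8`, the param ids, and the scales-then-weight load order come from the patch above; the concrete dimensions and toy weight values are illustrative only:

```cpp
#include "layer.h"     // ncnn::create_layer
#include "mat.h"
#include "modelbin.h"  // ncnn::ModelBinFromMatArray
#include "option.h"
#include "paramdict.h"

#include <stdio.h>
#include <vector>

int main()
{
    const int in_dim = 16, out_dim = 6, group_size = 16;

    ncnn::Layer* op = ncnn::create_layer("LinearInt8");
    if (!op)
        return -1;

    ncnn::ParamDict pd;
    pd.set(0, in_dim);     // param id 0: in_dim
    pd.set(1, out_dim);    // param id 1: out_dim
    pd.set(2, group_size); // param id 2: group_size
    op->load_param(pd);

    // weights in load_model order: per-group float scales, then int8 weights
    std::vector<ncnn::Mat> weights(2);
    weights[0] = ncnn::Mat(in_dim * out_dim / group_size);
    weights[0].fill(0.05f);
    weights[1] = ncnn::Mat(in_dim * out_dim, (size_t)1u); // elemsize 1 = int8
    signed char* w = weights[1];
    for (int i = 0; i < in_dim * out_dim; i++)
        w[i] = (signed char)(i % 15 - 7); // toy quantized values
    op->load_model(ncnn::ModelBinFromMatArray(weights.data()));

    ncnn::Option opt;
    opt.num_threads = 1;
    op->create_pipeline(opt);

    ncnn::Mat x(in_dim, 1); // 2D input: w = in_dim, h = batch rows
    x.fill(1.f);
    ncnn::Mat y;
    op->forward(x, y, opt); // y has w = out_dim, h = 1

    printf("y[0] = %f\n", y.row(0)[0]);

    op->destroy_pipeline(opt);
    delete op;
    return 0;
}
```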