Tencent · nihui · Oct 19, 2022 · Oct 18, 2022 · Oct 18, 2022 · Oct 18, 2022
diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md
@@ -29,6 +29,7 @@
 * [Exp](#exp)
 * [Flatten](#flatten)
 * [GELU](#gelu)
+* [GLU](#glu)
 * [Gemm](#gemm)
 * [GroupNorm](#groupnorm)
 * [GRU](#gru)
@@ -784,6 +785,22 @@ else                y = 0.5 * x * erfc(-0.70710678 * x)
 | --------- | ------------- | ----- | --------- | ----------------- |
 | 0         | fast_gelu     | int   | 0         | use approximation |
 
+# GLU
+
+If axis < 0, we use axis = x.dims + axis
+
+GLU(a,b)=a⊗σ(b)
+
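+where a is the first half of the input along axis, b is the second half, ⊗ is element-wise multiplication and σ is the sigmoid function.
+
+axis specifies the dimension along which the input is split. Each half holds size / 2 elements along that dimension (integer division).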
+
+* one_blob_only
+
+| param id  | name          | type  | default   | description       |
+| --------- | ------------- | ----- | --------- | ----------------- |
+| 0         | axis          | int   | 0         | split dimension   |
+
 # Gemm
 ```
 a = transA ? transpose(x0) : x0

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -156,6 +156,7 @@ ncnn_add_layer(Deconvolution3D)
 ncnn_add_layer(DeconvolutionDepthWise3D)
 ncnn_add_layer(Einsum)
 ncnn_add_layer(DeformableConv2D)
+ncnn_add_layer(GLU)
 
 if(NCNN_VULKAN)
     ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)

diff --git a/src/layer/glu.cpp b/src/layer/glu.cpp
@@ -0,0 +1,220 @@
+// Copyright (c) 2022 Xiaomi Corp.        (author: Fangjun Kuang)
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of the
+// License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#include "glu.h"
+
+#include <math.h>
+
+namespace ncnn {
+
+// Sets the layer flags: GLU runs out of place on a single input blob.
+GLU::GLU()
+{
+    // the output is written to a separately created blob
+    support_inplace = false;
+
+    // forward() takes one bottom blob and produces one top blob
+    one_blob_only = true;
+}
+
+// Reads the layer parameters from pd. Always returns 0.
+int GLU::load_param(const ParamDict& pd)
+{
+    // param id 0 holds the split axis; it falls back to 0 when not set
+    const int split_axis = pd.get(0, 0);
+    axis = split_axis;
+
+    return 0;
+}
+
+// Single gated element: a * sigmoid(b), evaluated in single precision.
+static inline float glu_gate(float a, float b)
+{
+    const float sigmoid = 1.f / (1.f + expf(-b));
+    return a * sigmoid;
+}
+
+// Applies GLU to bottom_blob and stores the result in top_blob.
+//
+// The input is split into a first half a and a second half b along the
+// configured axis and the output is a * sigmoid(b). A negative axis is
+// resolved as dims + axis. For a 1-dim blob the axis is ignored. If the split
+// dimension has an odd size, the trailing element is not used.
+//
+// dims == 2: axis 0 splits h, axis 1 splits w.
+// dims == 3: axis 0 splits c, axis 1 splits h, axis 2 splits w.
+//
+// The data is read and written as float.
+//
+// Returns 0 on success and -100 when the output blob could not be created
+// (this includes a split dimension smaller than 2) or when the dims/axis
+// combination is not handled.
+int GLU::forward(const Mat& bottom_blob, Mat& top_blob,
+                 const Option& opt) const
+{
+    const int dims = bottom_blob.dims;
+    const int positive_axis = axis < 0 ? dims + axis : axis;
+
+    if (dims == 1)
+    {   // ignore axis
+        const int out_w = bottom_blob.w / 2;
+
+        top_blob.create(out_w, sizeof(float), opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        const float* in_ptr = bottom_blob;
+        const float* gate_ptr = in_ptr + out_w;
+        float* out_ptr = top_blob;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int x = 0; x < out_w; ++x)
+        {
+            out_ptr[x] = glu_gate(in_ptr[x], gate_ptr[x]);
+        }
+
+        return 0;
+    } // if (dims == 1)
+
+    if (dims == 2 && positive_axis == 0)
+    {
+        const int w = bottom_blob.w;
+        const int out_h = bottom_blob.h / 2;
+
+        top_blob.create(w, out_h, sizeof(float), opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        // The blob is walked as one flat buffer: the second half of the rows
+        // starts size floats after the first one.
+        const int size = w * out_h;
+        const float* in_ptr = bottom_blob;
+        const float* gate_ptr = in_ptr + size;
+        float* out_ptr = top_blob;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = 0; i < size; ++i)
+        {
+            out_ptr[i] = glu_gate(in_ptr[i], gate_ptr[i]);
+        }
+
+        return 0;
+    } // if (dims == 2 && positive_axis == 0)
+
+    if (dims == 2 && positive_axis == 1)
+    {
+        const int h = bottom_blob.h;
+        const int out_w = bottom_blob.w / 2;
+
+        top_blob.create(out_w, h, sizeof(float), opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int y = 0; y < h; ++y)
+        {
+            // each row is split in the middle
+            const float* in_ptr = bottom_blob.row(y);
+            const float* gate_ptr = in_ptr + out_w;
+            float* out_ptr = top_blob.row(y);
+
+            for (int x = 0; x < out_w; ++x)
+            {
+                out_ptr[x] = glu_gate(in_ptr[x], gate_ptr[x]);
+            }
+        }
+
+        return 0;
+    } // if (dims == 2 && positive_axis == 1)
+
+    if (dims == 3 && positive_axis == 0)
+    {
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int out_c = bottom_blob.c / 2;
+
+        top_blob.create(w, h, out_c, sizeof(float), opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        const int size = w * h;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < out_c; ++q)
+        {
+            // channel q is gated by channel q + out_c; addressing the gate
+            // channel directly avoids narrowing an out_c * cstep product to int
+            const float* in_ptr = bottom_blob.channel(q);
+            const float* gate_ptr = bottom_blob.channel(q + out_c);
+            float* out_ptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; ++i)
+            {
+                out_ptr[i] = glu_gate(in_ptr[i], gate_ptr[i]);
+            }
+        }
+
+        return 0;
+    } // if (dims == 3 && positive_axis == 0)
+
+    if (dims == 3 && positive_axis == 1)
+    {
+        const int w = bottom_blob.w;
+        const int c = bottom_blob.c;
+        const int out_h = bottom_blob.h / 2;
+
+        top_blob.create(w, out_h, c, sizeof(float), opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        const int size = w * out_h;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < c; ++q)
+        {
+            // within a channel the second half of the rows follows the first
+            const float* in_ptr = bottom_blob.channel(q);
+            const float* gate_ptr = in_ptr + size;
+            float* out_ptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; ++i)
+            {
+                out_ptr[i] = glu_gate(in_ptr[i], gate_ptr[i]);
+            }
+        }
+
+        return 0;
+    } // if (dims == 3 && positive_axis == 1)
+
+    if (dims == 3 && positive_axis == 2)
+    {
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int c = bottom_blob.c;
+        const int out_w = w / 2;
+
+        top_blob.create(out_w, h, c, sizeof(float), opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < c; ++q)
+        {
+            const float* in_ptr = bottom_blob.channel(q);
+            float* out_ptr = top_blob.channel(q);
+
+            for (int y = 0; y < h; ++y)
+            {
+                // each input row is split in the middle
+                const float* gate_ptr = in_ptr + out_w;
+
+                for (int x = 0; x < out_w; ++x)
+                {
+                    out_ptr[x] = glu_gate(in_ptr[x], gate_ptr[x]);
+                }
+
+                in_ptr += w;
+                out_ptr += out_w;
+            }
+        }
+
+        return 0;
+    } // if (dims == 3 && positive_axis == 2)
+
+    // unsupported dims / axis combination
+    return -100;
+}
+
+} // namespace ncnn
diff --git a/src/layer/glu.h b/src/layer/glu.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 Xiaomi Corp.        (author: Fangjun Kuang)
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of the
+// License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+#ifndef LAYER_GLU_H
+#define LAYER_GLU_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Gated linear unit layer: splits the input in two halves along axis and
+// outputs first_half * sigmoid(second_half).
+class GLU : public Layer
+{
+public:
+    GLU();
+
+    // Loads axis from param id 0 (default 0).
+    virtual int load_param(const ParamDict& pd);
+
+    // Computes the layer output for bottom_blob into top_blob.
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+public:
+    // Dimension to split; a negative value is resolved as dims + axis.
+    int axis;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_GLU_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -86,6 +86,7 @@ ncnn_add_layer_test(ELU)
 ncnn_add_layer_test(ExpandDims)
 ncnn_add_layer_test(Flatten)
 ncnn_add_layer_test(GELU)
+ncnn_add_layer_test(GLU)
 ncnn_add_layer_test(Gemm)
 ncnn_add_layer_test(GroupNorm)
 ncnn_add_layer_test(GRU)

diff --git a/tests/test_glu.cpp b/tests/test_glu.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) 2022 Xiaomi Corp.        (author: Fangjun Kuang)
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer/glu.h"
+#include "testutil.h"
+
+// Runs the GLU layer test on input a with the given axis and reports a
+// failure on stderr. Returns the result of test_layer.
+static int test_glu(const ncnn::Mat& a, int axis)
+{
+    // GLU has no weights
+    std::vector<ncnn::Mat> weights(0);
+
+    ncnn::ParamDict pd;
+    pd.set(0, axis);
+
+    const int status = test_layer<ncnn::GLU>("GLU", pd, weights, a);
+    if (status == 0)
+        return status;
+
+    fprintf(stderr, "test_glu failed a.dims=%d a=(%d %d %d) axis=%d\n", a.dims, a.w, a.h, a.c, axis);
+    return status;
+}
+
+// 3-dim inputs. Stops at the first failing case and returns 1, otherwise 0.
+static int test_glu_0()
+{
+    // each entry is { w, h, c, axis }
+    static const int cases[][4] = {
+        {6, 7, 24, 0},
+        {6, 8, 24, 1},
+        {6, 8, 24, 2},
+        {36, 7, 22, 0},
+        {5, 256, 23, -2},
+        {129, 9, 60, 2},
+        {129, 9, 30, -1},
+    };
+    const int count = sizeof(cases) / sizeof(cases[0]);
+
+    for (int i = 0; i < count; i++)
+    {
+        const int* p = cases[i];
+        if (test_glu(RandomMat(p[0], p[1], p[2]), p[3]) != 0)
+            return 1;
+    }
+
+    return 0;
+}
+
+// 2-dim inputs. Stops at the first failing case and returns 1, otherwise 0.
+static int test_glu_1()
+{
+    // each entry is { w, h, axis }
+    static const int cases[][3] = {
+        {10, 24, 0},
+        {7, 24, 1},
+        {128, 22, 0},
+        {128, 256, 1},
+    };
+    const int count = sizeof(cases) / sizeof(cases[0]);
+
+    for (int i = 0; i < count; i++)
+    {
+        const int* p = cases[i];
+        if (test_glu(RandomMat(p[0], p[1]), p[2]) != 0)
+            return 1;
+    }
+
+    return 0;
+}
+
+// 1-dim inputs, all with axis 0. Stops at the first failing case and
+// returns 1, otherwise 0.
+static int test_glu_2()
+{
+    static const int widths[] = {10, 20, 128};
+    const int count = sizeof(widths) / sizeof(widths[0]);
+
+    for (int i = 0; i < count; i++)
+    {
+        if (test_glu(RandomMat(widths[i]), 0) != 0)
+            return 1;
+    }
+
+    return 0;
+}
+
+// Seeds the random generator and runs the test groups in order; the exit
+// code is 1 as soon as one group fails and 0 when all pass.
+int main()
+{
+    SRAND(7767517);
+
+    if (test_glu_0() != 0)
+        return 1;
+
+    if (test_glu_1() != 0)
+        return 1;
+
+    if (test_glu_2() != 0)
+        return 1;
+
+    return 0;
+}
diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
@@ -123,6 +123,7 @@ set(pnnx_pass_level2_SRCS
     pass_level2/F_embedding.cpp
     pass_level2/F_feature_alpha_dropout.cpp
     pass_level2/F_gelu.cpp
+    pass_level2/F_glu.cpp
     pass_level2/F_grid_sample.cpp
     pass_level2/F_group_norm.cpp
     pass_level2/F_hardshrink.cpp
@@ -366,6 +367,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/F_elu.cpp
     pass_ncnn/F_embedding.cpp
     pass_ncnn/F_gelu.cpp
+    pass_ncnn/F_glu.cpp
     pass_ncnn/F_group_norm.cpp
     pass_ncnn/F_hardsigmoid.cpp
     pass_ncnn/F_hardswish.cpp