Tencent · brightening-eyes · Aug 29, 2023 · Aug 29, 2023 · Aug 29, 2023 · Aug 29, 2023
diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md
@@ -1781,3 +1781,4 @@ Operation type:
 - 17 = LOG10
 - 18 = ROUND
 - 19 = TRUNC
+- 20 = ERF
diff --git a/src/layer/arm/unaryop_arm.cpp b/src/layer/arm/unaryop_arm.cpp
@@ -550,6 +550,9 @@ int UnaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     if (op_type == Operation_TRUNC)
         return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ERF)
+        return UnaryOp::forward_inplace(bottom_top_blob, opt);
+
     return 0;
 }
 
@@ -686,6 +689,9 @@ int UnaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt)
     if (op_type == Operation_TRUNC)
         return unary_op_inplace_bf16s<unary_op_trunc>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ERF)
+        return UnaryOp::forward_inplace(bottom_top_blob, opt);
+
     return 0;
 }
 #endif // NCNN_BF16

diff --git a/src/layer/loongarch/unaryop_loongarch.cpp b/src/layer/loongarch/unaryop_loongarch.cpp
@@ -482,6 +482,9 @@ int UnaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt)
     if (op_type == Operation_TRUNC)
         return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ERF)
+        return UnaryOp::forward_inplace(bottom_top_blob, opt);
+
     return 0;
 }
 

diff --git a/src/layer/mips/unaryop_mips.cpp b/src/layer/mips/unaryop_mips.cpp
@@ -436,6 +436,27 @@ struct unary_op_trunc
 #endif // __mips_msa
 };
 
+// erf functor for the MIPS backend: a scalar path and, when MSA is enabled, a 4-lane path.
+struct unary_op_erf
+{
+    float func(const float& x) const
+    {
+        return static_cast<float>(erf(x));
+    }
+#if __mips_msa
+    // No vector erf yet: the four lanes are stored to a scratch array, passed through erf() one by one, and reloaded.
+    v4f32 func_pack4(const v4f32& x) const
+    {
+        // TODO msa optimize
+        float lanes[4];
+        __msa_st_w((v4i32)x, lanes, 0);
+        for (int i = 0; i < 4; i++)
+            lanes[i] = erf(lanes[i]);
+        return (v4f32)__msa_ld_w(lanes, 0);
+    }
+#endif // __mips_msa
+};
+
 } // namespace UnaryOp_mips_functor
 
 int UnaryOp_mips::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -502,6 +523,9 @@ int UnaryOp_mips::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     if (op_type == Operation_TRUNC)
         return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ERF)
+        return unary_op_inplace<unary_op_erf>(bottom_top_blob, opt);
+
     return 0;
 }
 

diff --git a/src/layer/riscv/unaryop_riscv.cpp b/src/layer/riscv/unaryop_riscv.cpp
@@ -360,6 +360,9 @@ int UnaryOp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
     if (op_type == Operation_TRUNC)
         return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ERF)
+        return UnaryOp::forward_inplace(bottom_top_blob, opt);
+
     return 0;
 #else  // __riscv_vector
     return UnaryOp::forward_inplace(bottom_top_blob, opt);
@@ -683,6 +686,9 @@ int UnaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt
     if (op_type == Operation_TRUNC)
         return unary_op_inplace_fp16s<unary_op_trunc_fp16s>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ERF)
+        return UnaryOp::forward_inplace(bottom_top_blob, opt);
+
     return 0;
 }
 #endif // __riscv_vector && __riscv_zfh

diff --git a/src/layer/unaryop.cpp b/src/layer/unaryop.cpp
@@ -218,6 +218,14 @@ struct unary_op_trunc
     }
 };
 
+struct unary_op_erf // per-element functor used for Operation_ERF: erf(x) converted back to float
+{
+    float operator()(const float& x) const
+    {
+        return static_cast<float>(erf(x));
+    }
+};
+
 int UnaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
     if (op_type == Operation_ABS)
@@ -280,6 +288,9 @@ int UnaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     if (op_type == Operation_TRUNC)
         return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ERF)
+        return unary_op_inplace<unary_op_erf>(bottom_top_blob, opt);
+
     return 0;
 }
 

diff --git a/src/layer/unaryop.h b/src/layer/unaryop.h
@@ -49,7 +49,8 @@ class UnaryOp : public Layer
         Operation_TANH = 16,
         Operation_LOG10 = 17,
         Operation_ROUND = 18,
-        Operation_TRUNC = 19
+        Operation_TRUNC = 19,
+        Operation_ERF = 20
     };
 
 public:

diff --git a/src/layer/vulkan/shader/unaryop.comp b/src/layer/vulkan/shader/unaryop.comp
@@ -46,6 +46,21 @@ layout (push_constant) uniform parameter
     int cstep;
 } p;
 
+// erf(x) via the Abramowitz-Stegun 7.1.26 fit, evaluated in the shader's afp arithmetic type.
+afp erf(afp x)
+{
+    // named "k" rather than "p": p already names the push-constant block above
+    afp k = afp(0.3275911f);
+    afp x_abs = abs(x);
+    afp t = afp(1.0f) / (afp(1.0f) + k * x_abs);
+    // Horner evaluation of a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5
+    afp poly = afp(1.061405429f) * t + afp(-1.453152027f);
+    poly = poly * t + afp(1.421413741f);
+    poly = poly * t + afp(-0.284496736f);
+    poly = (poly * t + afp(0.254829592f)) * t;
+    return sign(x) * (afp(1.0f) - poly * exp(-x_abs * x_abs));
+}
+
 void main()
 {
     int gx = int(gl_GlobalInvocationID.x);
@@ -89,6 +104,7 @@ void main()
     if (op_type == 17) res = log(v) * afp(0.434294481903);
     if (op_type == 18) res = round(v);
     if (op_type == 19) res = trunc(v);
+    if (op_type == 20) res = erf(v);
 
 #if NCNN_image_shader
     image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res);

diff --git a/src/layer/vulkan/shader/unaryop_pack4.comp b/src/layer/vulkan/shader/unaryop_pack4.comp
@@ -46,6 +46,21 @@ layout (push_constant) uniform parameter
     int cstep;
 } p;
 
+// Lane-wise erf(x) via the Abramowitz-Stegun 7.1.26 fit; every literal is wrapped in afpvec4 so no bare float mixes in.
+afpvec4 erf(afpvec4 x)
+{
+    // named "k" rather than "p": p already names the push-constant block above
+    afpvec4 k = afpvec4(0.3275911f);
+    afpvec4 one = afpvec4(1.0f);
+    afpvec4 x_abs = abs(x);
+    afpvec4 t = one / (one + k * x_abs);
+    afpvec4 poly = afpvec4(1.061405429f) * t + afpvec4(-1.453152027f);
+    poly = poly * t + afpvec4(1.421413741f);
+    poly = poly * t + afpvec4(-0.284496736f);
+    poly = (poly * t + afpvec4(0.254829592f)) * t;
+    return sign(x) * (one - poly * exp(-x_abs * x_abs));
+}
+
 void main()
 {
     int gx = int(gl_GlobalInvocationID.x);
@@ -89,6 +104,7 @@ void main()
     if (op_type == 17) res = log(v) * afp(0.434294481903);
     if (op_type == 18) res = round(v);
     if (op_type == 19) res = trunc(v);
+    if (op_type == 20) res = erf(v);
 
 #if NCNN_image_shader
     image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res);

diff --git a/src/layer/vulkan/shader/unaryop_pack8.comp b/src/layer/vulkan/shader/unaryop_pack8.comp
@@ -47,6 +47,21 @@ layout (push_constant) uniform parameter
     int cstep;
 } p;
 
+// Lane-wise erf(x) on one afpvec4 half; Abramowitz-Stegun 7.1.26 fit with every literal wrapped in afpvec4.
+afpvec4 erf(afpvec4 x)
+{
+    // named "k" rather than "p": p already names the push-constant block above
+    afpvec4 k = afpvec4(0.3275911f);
+    afpvec4 one = afpvec4(1.0f);
+    afpvec4 x_abs = abs(x);
+    afpvec4 t = one / (one + k * x_abs);
+    afpvec4 poly = afpvec4(1.061405429f) * t + afpvec4(-1.453152027f);
+    poly = poly * t + afpvec4(1.421413741f);
+    poly = poly * t + afpvec4(-0.284496736f);
+    poly = (poly * t + afpvec4(0.254829592f)) * t;
+    return sign(x) * (one - poly * exp(-x_abs * x_abs));
+}
+
 void main()
 {
     int gx = int(gl_GlobalInvocationID.x);
@@ -171,6 +186,11 @@ void main()
         res[0] = trunc(v[0]);
         res[1] = trunc(v[1]);
     }
+    if (op_type == 20)
+    {
+        res[0] = erf(v[0]);
+        res[1] = erf(v[1]);
+    }
 
 #if NCNN_image_shader
     image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res);

diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp
@@ -642,6 +642,74 @@ struct unary_op_trunc
 #endif // __SSE2__
 };
 
+// erf functor. Scalars go through erf(); SIMD lanes use the Abramowitz-Stegun 7.1.26 fit
+// erf(|x|) ~= 1 - (a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5) * exp(-x^2), with t = 1 / (1 + k*|x|).
+struct unary_op_erf
+{
+    float func(const float& x) const
+    {
+        return (float)erf(x);
+    }
+#if __SSE2__
+    // 4 lanes. erf is odd, so the sign bit of x is XOR-ed back onto the fit of |x|.
+    __m128 func_pack4(const __m128& x) const
+    {
+        const __m128 one = _mm_set1_ps(1.0f);
+        const __m128 sign_bit = _mm_and_ps(x, _mm_set1_ps(-0.0f));
+        const __m128 x_abs = abs_ps(x);
+        // true divide: _mm_rcp_ps is only an approximate reciprocal
+        const __m128 t = _mm_div_ps(one, _mm_add_ps(one, _mm_mul_ps(_mm_set1_ps(0.3275911f), x_abs)));
+        __m128 poly = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.061405429f), t), _mm_set1_ps(-1.453152027f));
+        poly = _mm_add_ps(_mm_mul_ps(poly, t), _mm_set1_ps(1.421413741f));
+        poly = _mm_add_ps(_mm_mul_ps(poly, t), _mm_set1_ps(-0.284496736f));
+        poly = _mm_add_ps(_mm_mul_ps(poly, t), _mm_set1_ps(0.254829592f));
+        poly = _mm_mul_ps(poly, t);
+        const __m128 gauss = exp_ps(_mm_mul_ps(_mm_sub_ps(_mm_setzero_ps(), x_abs), x_abs));
+        const __m128 y = _mm_sub_ps(one, _mm_mul_ps(poly, gauss));
+        return _mm_xor_ps(y, sign_bit);
+    }
+#if __AVX__
+    // 8 lanes, same scheme as func_pack4.
+    __m256 func_pack8(const __m256& x) const
+    {
+        const __m256 one = _mm256_set1_ps(1.0f);
+        const __m256 sign_bit = _mm256_and_ps(x, _mm256_set1_ps(-0.0f));
+        const __m256 x_abs = abs256_ps(x);
+        // t = 1 / (1 + k*|x|), with a true divide rather than the approximate _mm256_rcp_ps
+        const __m256 t = _mm256_div_ps(one, _mm256_add_ps(one, _mm256_mul_ps(_mm256_set1_ps(0.3275911f), x_abs)));
+        __m256 poly = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.061405429f), t), _mm256_set1_ps(-1.453152027f));
+        poly = _mm256_add_ps(_mm256_mul_ps(poly, t), _mm256_set1_ps(1.421413741f));
+        poly = _mm256_add_ps(_mm256_mul_ps(poly, t), _mm256_set1_ps(-0.284496736f));
+        poly = _mm256_add_ps(_mm256_mul_ps(poly, t), _mm256_set1_ps(0.254829592f));
+        poly = _mm256_mul_ps(poly, t);
+        // exp(-x^2), formed as (0 - |x|) * |x|
+        const __m256 gauss = exp256_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_setzero_ps(), x_abs), x_abs));
+        const __m256 y = _mm256_sub_ps(one, _mm256_mul_ps(poly, gauss));
+        return _mm256_xor_ps(y, sign_bit);
+    }
+#if __AVX512F__
+    // 16 lanes, same scheme. The sign bit is moved with integer ops because the float bitwise
+    // forms (_mm512_and_ps / _mm512_xor_ps) are AVX512DQ, while this path is gated on AVX512F only.
+    __m512 func_pack16(const __m512& x) const
+    {
+        const __m512 one = _mm512_set1_ps(1.0f);
+        const __m512i sign_bit = _mm512_and_epi32(_mm512_castps_si512(x), _mm512_castps_si512(_mm512_set1_ps(-0.0f)));
+        const __m512 x_abs = abs512_ps(x);
+        const __m512 t = _mm512_div_ps(one, _mm512_add_ps(one, _mm512_mul_ps(_mm512_set1_ps(0.3275911f), x_abs)));
+        __m512 poly = _mm512_add_ps(_mm512_mul_ps(_mm512_set1_ps(1.061405429f), t), _mm512_set1_ps(-1.453152027f));
+        poly = _mm512_add_ps(_mm512_mul_ps(poly, t), _mm512_set1_ps(1.421413741f));
+        poly = _mm512_add_ps(_mm512_mul_ps(poly, t), _mm512_set1_ps(-0.284496736f));
+        poly = _mm512_add_ps(_mm512_mul_ps(poly, t), _mm512_set1_ps(0.254829592f));
+        poly = _mm512_mul_ps(poly, t);
+        const __m512 gauss = exp512_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_setzero_ps(), x_abs), x_abs));
+        const __m512 y = _mm512_sub_ps(one, _mm512_mul_ps(poly, gauss));
+        return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(y), sign_bit));
+    }
+#endif // __AVX512F__
+#endif // __AVX__
+#endif // __SSE2__
+};
+
 } // namespace UnaryOp_x86_functor
 
 int UnaryOp_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -707,6 +775,9 @@ int UnaryOp_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     if (op_type == Operation_TRUNC)
         return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ERF)
+        return UnaryOp::forward_inplace(bottom_top_blob, opt);
+
     return 0;
 }
 

diff --git a/tests/test_unaryop.cpp b/tests/test_unaryop.cpp
@@ -15,7 +15,7 @@
 #include "layer/unaryop.h"
 #include "testutil.h"
 
-#define OP_TYPE_MAX 20
+#define OP_TYPE_MAX 21
 
 static int op_type = 0;
 

diff --git a/tools/onnx/onnx2ncnn.cpp b/tools/onnx/onnx2ncnn.cpp
@@ -3714,6 +3714,10 @@ int main(int argc, char** argv)
         {
             fprintf(pp, "%-16s", "EmbedLayerNormalization");
         }
+        else if (op == "Erf")
+        {
+            fprintf(pp, "%-16s", "UnaryOp");
+        }
         else if (op == "Exp")
         {
             fprintf(pp, "%-16s", "UnaryOp");
@@ -4510,6 +4514,11 @@ int main(int argc, char** argv)
 
             fwrite_tensor_proto_data(B, bp);
         }
+        else if (op == "Erf")
+        {
+            int op_type = 20;
+            fprintf(pp, " 0=%d", op_type);
+        }
         else if (op == "Exp")
         {
             int op_type = 7;