diff --git a/.ci/test-coverage.yml b/.ci/test-coverage.yml
index f46bf6e3621..7ccc106564f 100644
--- a/.ci/test-coverage.yml
+++ b/.ci/test-coverage.yml
@@ -79,7 +79,7 @@ jobs:
       run: |
         mkdir build && cd build
         cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         printf "[Processor]\nThreadCount=4\n" > build/tests/SwiftShader.ini
@@ -159,7 +159,7 @@ jobs:
       run: |
         mkdir build && cd build
         cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         export LP_NUM_THREADS=4
@@ -230,9 +230,9 @@ jobs:
           -DNCNN_AVX512BF16=${{matrix.AVX512BF16}} \
           -DNCNN_AVX512FP16=${{matrix.AVX512FP16}} \
           ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
-      run: cd build && ctest --output-on-failure -j $(nproc)
+      run: cd build && ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
@@ -309,12 +309,12 @@ jobs:
       run: |
         mkdir build && cd build
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_GNU_INLINE_ASM=${{matrix.GNU_INLINE_ASM}} -DNCNN_VFPV4=ON -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
@@ -327,12 +327,12 @@ jobs:
       run: |
         mkdir build-armhf-vfpv3-d16 && cd build-armhf-vfpv3-d16
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_GNU_INLINE_ASM=${{matrix.GNU_INLINE_ASM}} -DNCNN_VFPV4=OFF -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test-armhf-vfpv3-d16
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build-armhf-vfpv3-d16
-        TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 4
     - name: lcov-collect-armhf-vfpv3-d16
       run: |
         cd build-armhf-vfpv3-d16
@@ -423,12 +423,12 @@ jobs:
           -DNCNN_ARM84BF16=${{matrix.ARM84BF16}} \
           -DNCNN_ARM84I8MM=${{matrix.ARM84I8MM}} \
           ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
@@ -502,12 +502,12 @@ jobs:
       run: |
         mkdir build && cd build
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=OFF -DNCNN_MMI=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
@@ -581,12 +581,12 @@ jobs:
       run: |
         mkdir build && cd build
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=ON -DNCNN_MMI=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
@@ -660,12 +660,12 @@ jobs:
       run: |
         mkdir build && cd build
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-ppc TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc-linux-gnu" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-ppc TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc-linux-gnu" ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
@@ -739,12 +739,12 @@ jobs:
       run: |
         mkdir build && cd build
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc64le-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
@@ -824,12 +824,12 @@ jobs:
       run: |
         mkdir build && cd build
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
@@ -951,12 +951,12 @@ jobs:
         export RISCV_ROOT_PATH=${{ci.workspace}}/rv64gcv-install
         mkdir build && cd build
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_C_FLAGS="-O1" -DCMAKE_CXX_FLAGS="-O1" -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test-vlen128
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,x-zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,x-zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j 4
     - name: lcov-collect-vlen128
       run: |
         cd build
@@ -971,7 +971,7 @@ jobs:
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,x-zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,x-zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j 4
     - name: lcov-collect-vlen256
       run: |
         cd build
@@ -1051,12 +1051,12 @@ jobs:
         export LOONGARCH64_ROOT_PATH=${{ci.workspace}}/cross-tools
         mkdir build && cd build
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_LSX=ON -DNCNN_LASX=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-loongarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;${{ci.workspace}}/cross-tools/target" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-loongarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;${{ci.workspace}}/cross-tools/target" ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
@@ -1099,9 +1099,9 @@ jobs:
       run: |
         mkdir build && cd build
         cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
-        cmake --build . -j $(nproc)
+        cmake --build . -j 4
     - name: test
-      run: cd build && ctest --output-on-failure -j $(nproc)
+      run: cd build && ctest --output-on-failure -j 4
     - name: lcov-collect
       run: |
         cd build
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
index fe025456f48..c55b3f27687 100644
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -241,13 +241,13 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 
         op->create_pipeline(opt);
 
         // forward
-        op->forward(bottom_blob, top_blob, opt);
+        int ret = op->forward(bottom_blob, top_blob, opt);
 
         op->destroy_pipeline(opt);
 
         delete op;
 
-        return 0;
+        return ret;
     }
 }
@@ -401,6 +401,8 @@ int Convolution::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
         opt_g.blob_allocator = opt.workspace_allocator;
 
         quantize_to_int8(bottom_blob, bottom_blob_unbordered, bottom_blob_int8_scales, opt_g);
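+        // quantize_to_int8 leaves its output Mat empty when the allocation
+        // inside it fails, so report out-of-memory here; -100 follows the
+        // error convention used by the ncnn forward implementations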
+        if (bottom_blob_unbordered.empty())
+            return -100;
     }
 
     Mat bottom_blob_bordered;
diff --git a/tests/test_convolution_oom.cpp b/tests/test_convolution_oom.cpp
new file mode 100644
index 00000000000..6643753359a
--- /dev/null
+++ b/tests/test_convolution_oom.cpp
@@ -0,0 +1,149 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "testutil.h"
+
+static int test_convolution_oom(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
+{
+    ncnn::Mat a = RandomMat(w, h, c);
+
+    ncnn::ParamDict pd;
+    pd.set(0, outch);
+    pd.set(1, kernel);
+    pd.set(2, dilation);
+    pd.set(3, stride);
+    pd.set(4, pad);
+    pd.set(5, bias);
+    pd.set(6, outch * c * kernel * kernel);
+
+    int activation_type = RAND() % 7; // 0 1 2 3 4 5 6
+    ncnn::Mat activation_params(2);
+    activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha
+    activation_params[1] = RandomFloat(0, 1);                                               // beta
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    std::vector<ncnn::Mat> weights(bias ? 2 : 1);
+    weights[0] = RandomMat(outch * c * kernel * kernel);
+    if (bias)
+        weights[1] = RandomMat(outch);
+
+    int ret = test_layer_oom("Convolution", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_convolution_oom failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
+    }
+
+    return ret;
+}
+
+static int test_convolution_0()
+{
+    return 0
+           || test_convolution_oom(9, 7, 31, 63, 1, 1, 1, 0, 1)
+           || test_convolution_oom(9, 7, 31, 63, 3, 1, 1, 1, 1);
+}
+
+#if NCNN_INT8
+static int test_convolution_oom_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false)
+{
+    ncnn::Mat a = RandomMat(w, h, c);
+
+    ncnn::ParamDict pd;
+    pd.set(0, outch);
+    pd.set(1, kernel);
+    pd.set(2, dilation);
+    pd.set(3, stride);
+    pd.set(4, pad);
+    pd.set(5, bias);
+    pd.set(6, outch * c * kernel * kernel);
+    pd.set(8, requant ? 101 : 1); // int8_scale_term
+
+    int activation_type = RAND() % 7; // 0 1 2 3 4 5 6
+    ncnn::Mat activation_params(2);
+    activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha
+    activation_params[1] = RandomFloat(0, 1);                                               // beta
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    std::vector<ncnn::Mat> weights(bias ? 5 : 4);
+    weights[0] = RandomMat(outch * c * kernel * kernel);
+
+    ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel);
+    ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep);
+    ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat();
+
+    if (kernel == 3 && dilation == 1 && stride == 1)
+    {
+        // test for 6bit quant
+        for (int i = 0; i < weight_scales.w; i++)
+            weight_scales[i] = weight_scales[i] / 4.f;
+    }
+
+    if (bias)
+    {
+        weights[1] = RandomMat(outch);
+        weights[2] = weight_scales;
+        weights[3] = input_scales;
+        weights[4] = top_scales;
+    }
+    else
+    {
+        weights[1] = weight_scales;
+        weights[2] = input_scales;
+        weights[3] = top_scales;
+    }
+
+    int flag = TEST_LAYER_DISABLE_GPU_TESTING;
+    int ret = test_layer_oom("Convolution", pd, weights, a, flag);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_convolution_oom_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
+    }
+
+    return ret;
+}
+
+static int test_convolution_1()
+{
+    return 0
+           || test_convolution_oom_int8(9, 7, 31, 63, 1, 1, 1, 0, 1)
+           || test_convolution_oom_int8(9, 7, 31, 63, 3, 1, 1, 1, 1);
+}
+
+static int test_convolution_2()
+{
+    return 0
+           || test_convolution_oom_int8(9, 7, 31, 63, 1, 1, 1, 0, 1, true)
+           || test_convolution_oom_int8(9, 7, 31, 63, 3, 1, 1, 1, 1, true);
+}
+#endif // NCNN_INT8
+
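+// An OOM test for another layer follows the same shape; a minimal sketch
+// (hypothetical layer choice, not part of this test):
+//
+//   static int test_relu_oom()
+//   {
+//       ncnn::Mat a = RandomMat(25, 27, 32);
+//       ncnn::ParamDict pd;
+//       std::vector<ncnn::Mat> weights(0);
+//       return test_layer_oom("ReLU", pd, weights, a);
+//   }
+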
+int main()
+{
+    SRAND(7767517);
+
+#if __mips__ || __loongarch64 || __riscv
+    // TODO
+    return 0;
+#endif
+
+#if NCNN_INT8
+    return test_convolution_0() || test_convolution_1() || test_convolution_2();
+#else
+    return test_convolution_0();
+#endif
+}
diff --git a/tests/test_softmax_oom.cpp b/tests/test_softmax_oom.cpp
new file mode 100644
index 00000000000..5fea7636939
--- /dev/null
+++ b/tests/test_softmax_oom.cpp
@@ -0,0 +1,60 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "testutil.h"
+
+static int test_softmax_oom(const ncnn::Mat& a, int axis)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, axis); // axis
+    pd.set(1, 1);    // fixbug0
+
+    std::vector<ncnn::Mat> weights(0);
+
+    int ret = test_layer_oom("Softmax", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_softmax_oom failed a.dims=%d a=(%d %d %d) axis=%d\n", a.dims, a.w, a.h, a.c, axis);
+    }
+
+    return ret;
+}
+
+static int test_softmax_0()
+{
+    ncnn::Mat a = RandomMat(25, 27, 32);
+    return test_softmax_oom(a, 0) || test_softmax_oom(a, 1) || test_softmax_oom(a, 2);
+}
+
+static int test_softmax_1()
+{
+    ncnn::Mat a = RandomMat(25, 32);
+    return test_softmax_oom(a, 0) || test_softmax_oom(a, 1);
+}
+
+static int test_softmax_2()
+{
+    ncnn::Mat a = RandomMat(128);
+    return test_softmax_oom(a, 0);
+}
+
+int main()
+{
+    SRAND(7767517);
+
+    return 0
+           || test_softmax_0()
+           || test_softmax_1()
+           || test_softmax_2();
+}
diff --git a/tests/testutil.cpp b/tests/testutil.cpp
index 2e76f6f3901..07d95547d44 100644
--- a/tests/testutil.cpp
+++ b/tests/testutil.cpp
@@ -19,6 +19,7 @@
 #include "mat.h"
 #include "prng.h"
 
+#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -323,6 +324,166 @@ int CompareMat(const std::vector<ncnn::Mat>& a, const std::vector<ncnn::Mat>& b, float epsilon)
     return 0;
 }
 
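+// Cast the input to the storage type the layer under test supports
+// (fp16/bf16 where the CPU allows it), then repack it to the widest
+// elempack the element count permits.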
+static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const ncnn::Option& opt, const ncnn::Layer* op, int flag)
+{
+    // clang-format off
+    // *INDENT-OFF*
+#if NCNN_VFPV4
+    if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    {
+        ncnn::cast_float32_to_float16(a, a4, opt);
+    }
+    else
+#endif // NCNN_VFPV4
+#if NCNN_RVV
+    if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    {
+        ncnn::cast_float32_to_float16(a, a4, opt);
+    }
+    else
+#endif // NCNN_RVV
+#if NCNN_BF16
+    if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    {
+        ncnn::cast_float32_to_bfloat16(a, a4, opt);
+    }
+    else
+#endif // NCNN_BF16
+    if (opt.use_fp16_storage && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    {
+        ncnn::cast_float32_to_float16(a, a4, opt);
+    }
+    else
+    {
+        a4 = a;
+    }
+    // *INDENT-ON*
+    // clang-format on
+
+    if (opt.use_packing_layout && op->support_packing && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_PACKING))
+    {
+        // resolve dst_elempack
+        int dims = a4.dims;
+        int elemcount = 0;
+        if (dims == 1) elemcount = a4.elempack * a4.w;
+        if (dims == 2) elemcount = a4.elempack * a4.h;
+        if (dims == 3 || dims == 4) elemcount = a4.elempack * a4.c;
+
+        int elembits = a4.elembits();
+
+        int dst_elempack = 1;
+
+        if (elembits == 32)
+        {
+#if NCNN_AVX512
+            if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512())
+                dst_elempack = 16;
+            else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
+                dst_elempack = 8;
+            else if (elemcount % 4 == 0)
+                dst_elempack = 4;
+#elif NCNN_AVX
+            if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
+                dst_elempack = 8;
+            else if (elemcount % 4 == 0)
+                dst_elempack = 4;
+#elif NCNN_RVV
+            const int packn = ncnn::cpu_riscv_vlenb() / (elembits / 8);
+            if (elemcount % packn == 0)
+                dst_elempack = packn;
+#else
+            if (elemcount % 4 == 0)
+                dst_elempack = 4;
+#endif
+        }
+        if (elembits == 16)
+        {
+#if NCNN_ARM82
+            if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
+                dst_elempack = 8;
+            else if (elemcount % 4 == 0)
+                dst_elempack = 4;
+#elif NCNN_RVV
+            const int packn = ncnn::cpu_riscv_vlenb() / 2;
+            if (elemcount % packn == 0)
+                dst_elempack = packn;
+#else
+            if (elemcount % 4 == 0)
+                dst_elempack = 4;
+#endif
+        }
+        if (elembits == 8)
+        {
+#if NCNN_RVV
+            const int packn = ncnn::cpu_riscv_vlenb() / 1;
+            if (elemcount % packn == 0)
+                dst_elempack = packn;
+#else
+            if (elemcount % 8 == 0)
+                dst_elempack = 8;
+#endif
+        }
+
+        if (flag & TEST_LAYER_ENABLE_FORCE_INPUT_PACK8)
+            dst_elempack = 8;
+
+        ncnn::Mat a4_packed;
+        ncnn::convert_packing(a4, a4_packed, dst_elempack, opt);
+        a4 = a4_packed;
+    }
+
+    return 0;
+}
+
+static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const ncnn::Option& opt, const ncnn::Layer* op, int flag)
+{
+    ncnn::Mat c4_unpacked;
+    if (c4.elempack != 1)
+    {
+        ncnn::convert_packing(c4, c4_unpacked, 1, opt);
+    }
+    else
+    {
+        c4_unpacked = c4;
+    }
+
+    // clang-format off
+    // *INDENT-OFF*
+#if NCNN_VFPV4
+    if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
+    {
+        ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
+    }
+    else
+#endif // NCNN_VFPV4
+#if NCNN_RVV
+    if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
+    {
+        ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
+    }
+    else
+#endif // NCNN_RVV
+#if NCNN_BF16
+    if (opt.use_bf16_storage && op->support_bf16_storage && c4_unpacked.elembits() == 16)
+    {
+        ncnn::cast_bfloat16_to_float32(c4_unpacked, c, opt);
+    }
+    else
+#endif // NCNN_BF16
+    if (opt.use_fp16_storage && op->support_fp16_storage && c4_unpacked.elembits() == 16)
+    {
+        ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
+    }
+    else
+    {
+        c = c4_unpacked;
+    }
+    // *INDENT-ON*
+    // clang-format on
+
+    return 0;
+}
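+// The two helpers above are used as a round trip: convert the input, run
+// the layer, then bring the output back to plain fp32 pack1 for comparison,
+// e.g.
+//
+//   ncnn::Mat a4, c4, c;
+//   convert_to_optimal_layout(a, a4, opt, op, flag);
+//   op->forward(a4, c4, opt);
+//   convert_to_vanilla_layout(c4, c, opt, op, flag);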
+
 int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& b, void (*func)(ncnn::Layer*), int flag)
 {
     ncnn::Layer* op = ncnn::create_layer_naive(typeindex);
@@ -444,111 +605,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& c, void (*func)(ncnn::Layer*), int flag)
     std::vector<ncnn::Mat> a4(a.size());
     for (size_t i = 0; i < a4.size(); i++)
     {
-        // clang-format off
-        // *INDENT-OFF*
-#if NCNN_VFPV4
-        if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
-        {
-            ncnn::cast_float32_to_float16(a[i], a4[i], opt);
-        }
-        else
-#endif // NCNN_VFPV4
-#if NCNN_RVV
-        if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
-        {
-            ncnn::cast_float32_to_float16(a[i], a4[i], opt);
-        }
-        else
-#endif // NCNN_RVV
-#if NCNN_BF16
-        if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
-        {
-            ncnn::cast_float32_to_bfloat16(a[i], a4[i], opt);
-        }
-        else
-#endif // NCNN_BF16
-        if (opt.use_fp16_storage && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
-        {
-            ncnn::cast_float32_to_float16(a[i], a4[i], opt);
-        }
-        else
-        {
-            a4[i] = a[i];
-        }
-        // *INDENT-ON*
-        // clang-format on
-
-        if (opt.use_packing_layout && op->support_packing && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_PACKING))
-        {
-            // resolve dst_elempack
-            int dims = a4[i].dims;
-            int elemcount = 0;
-            if (dims == 1) elemcount = a4[i].elempack * a4[i].w;
-            if (dims == 2) elemcount = a4[i].elempack * a4[i].h;
-            if (dims == 3 || dims == 4) elemcount = a4[i].elempack * a4[i].c;
-
-            int elembits = a4[i].elembits();
-
-            int dst_elempack = 1;
-
-            if (elembits == 32)
-            {
-#if NCNN_AVX512
-                if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512())
-                    dst_elempack = 16;
-                else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
-                    dst_elempack = 8;
-                else if (elemcount % 4 == 0)
-                    dst_elempack = 4;
-#elif NCNN_AVX
-                if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
-                    dst_elempack = 8;
-                else if (elemcount % 4 == 0)
-                    dst_elempack = 4;
-#elif NCNN_RVV
-                const int packn = ncnn::cpu_riscv_vlenb() / (elembits / 8);
-                if (elemcount % packn == 0)
-                    dst_elempack = packn;
-#else
-                if (elemcount % 4 == 0)
-                    dst_elempack = 4;
-#endif
-            }
-            if (elembits == 16)
-            {
-#if NCNN_ARM82
-                if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
-                    dst_elempack = 8;
-                else if (elemcount % 4 == 0)
-                    dst_elempack = 4;
-#elif NCNN_RVV
-                const int packn = ncnn::cpu_riscv_vlenb() / 2;
-                if (elemcount % packn == 0)
-                    dst_elempack = packn;
-#else
-                if (elemcount % 4 == 0)
-                    dst_elempack = 4;
-#endif
-            }
-            if (elembits == 8)
-            {
-#if NCNN_RVV
-                const int packn = ncnn::cpu_riscv_vlenb() / 1;
-                if (elemcount % packn == 0)
-                    dst_elempack = packn;
-#else
-                if (elemcount % 8 == 0)
-                    dst_elempack = 8;
-#endif
-            }
-
-            if (flag & TEST_LAYER_ENABLE_FORCE_INPUT_PACK8)
-                dst_elempack = 8;
-
-            ncnn::Mat a4_packed;
-            ncnn::convert_packing(a4[i], a4_packed, dst_elempack, opt);
-            a4[i] = a4_packed;
-        }
+        convert_to_optimal_layout(a[i], a4[i], opt, op, flag);
     }
 
     c.resize(top_blob_count);
@@ -569,43 +626,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& c, void (*func)(ncnn::Layer*), int flag)
 
     for (int i = 0; i < top_blob_count; i++)
     {
-        // clang-format off
-        // *INDENT-OFF*
-#if NCNN_VFPV4
-        if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c[i].elembits() == 16)
-        {
-            ncnn::Mat c_fp32;
-            ncnn::cast_float16_to_float32(c[i], c_fp32, opt);
-            c[i] = c_fp32;
-        }
-        else
-#endif // NCNN_VFPV4
-#if NCNN_RVV
-        if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c[i].elembits() == 16)
-        {
-            ncnn::Mat c_fp32;
-            ncnn::cast_float16_to_float32(c[i], c_fp32, opt);
-            c[i] = c_fp32;
-        }
-        else
-#endif // NCNN_RVV
-#if NCNN_BF16
-        if (opt.use_bf16_storage && op->support_bf16_storage && c[i].elembits() == 16)
-        {
-            ncnn::Mat c_fp32;
-            ncnn::cast_bfloat16_to_float32(c[i], c_fp32, opt);
-            c[i] = c_fp32;
-        }
-        else
-#endif // NCNN_BF16
-        if (opt.use_fp16_storage && op->support_fp16_storage && c[i].elembits() == 16)
-        {
-            ncnn::Mat c_fp32;
-            ncnn::cast_float16_to_float32(c[i], c_fp32, opt);
-            c[i] = c_fp32;
-        }
-        // *INDENT-ON*
-        // clang-format on
+        convert_to_vanilla_layout(c[i], c[i], opt, op, flag);
     }
 
     op->destroy_pipeline(opt);
@@ -958,181 +979,40 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag)
 
     ncnn::Mat a4;
+    convert_to_optimal_layout(a, a4, opt, op, flag);
+
-    // clang-format off
-    // *INDENT-OFF*
-#if NCNN_VFPV4
-    if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
-    {
-        ncnn::cast_float32_to_float16(a, a4, opt);
-    }
-    else
-#endif // NCNN_VFPV4
-#if NCNN_RVV
-    if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    if (op->support_inplace)
     {
-        ncnn::cast_float32_to_float16(a, a4, opt);
+        c = a4.clone();
+        op->forward_inplace(c, opt);
     }
     else
-#endif // NCNN_RVV
-#if NCNN_BF16
-    if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
     {
-        ncnn::cast_float32_to_bfloat16(a, a4, opt);
+        op->forward(a4, c, opt);
     }
-    else
-#endif // NCNN_BF16
-    if (opt.use_fp16_storage && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+
+    convert_to_vanilla_layout(c, c, opt, op, flag);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
+#if NCNN_VULKAN
+int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag)
+{
+    if (!_opt.use_packing_layout)
     {
-        ncnn::cast_float32_to_float16(a, a4, opt);
+        // pack1 test is useless for gpu
+        return 233;
     }
-    else
+
+    ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex);
+    if (!op)
     {
-        a4 = a;
-    }
-    // *INDENT-ON*
-    // clang-format on
-
-    if (opt.use_packing_layout && op->support_packing && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_PACKING))
-    {
-        // resolve dst_elempack
-        int dims = a4.dims;
-        int elemcount = 0;
-        if (dims == 1) elemcount = a4.elempack * a4.w;
-        if (dims == 2) elemcount = a4.elempack * a4.h;
-        if (dims == 3 || dims == 4) elemcount = a4.elempack * a4.c;
-
-        int elembits = a4.elembits();
-
-        int dst_elempack = 1;
-
-        if (elembits == 32)
-        {
-#if NCNN_AVX512
-            if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512())
-                dst_elempack = 16;
-            else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
-                dst_elempack = 8;
-            else if (elemcount % 4 == 0)
-                dst_elempack = 4;
-#elif NCNN_AVX
-            if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx())
-                dst_elempack = 8;
-            else if (elemcount % 4 == 0)
-                dst_elempack = 4;
-#elif NCNN_RVV
-            const int packn = ncnn::cpu_riscv_vlenb() / (elembits / 8);
-            if (elemcount % packn == 0)
-                dst_elempack = packn;
-#else
-            if (elemcount % 4 == 0)
-                dst_elempack = 4;
-#endif
-        }
-        if (elembits == 16)
-        {
-#if NCNN_ARM82
-            if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
-                dst_elempack = 8;
-            else if (elemcount % 4 == 0)
-                dst_elempack = 4;
-#elif NCNN_RVV
-            const int packn = ncnn::cpu_riscv_vlenb() / 2;
-            if (elemcount % packn == 0)
-                dst_elempack = packn;
-#else
-            if (elemcount % 4 == 0)
-                dst_elempack = 4;
-#endif
-        }
-        if (elembits == 8)
-        {
-#if NCNN_RVV
-            const int packn = ncnn::cpu_riscv_vlenb() / 1;
-            if (elemcount % packn == 0)
-                dst_elempack = packn;
-#else
-            if (elemcount % 8 == 0)
-                dst_elempack = 8;
-#endif
-        }
-
-        if (flag & TEST_LAYER_ENABLE_FORCE_INPUT_PACK8)
-            dst_elempack = 8;
-
-        ncnn::Mat a4_packed;
-        ncnn::convert_packing(a4, a4_packed, dst_elempack, opt);
-        a4 = a4_packed;
-    }
-
-    if (op->support_inplace)
-    {
-        c = a4.clone();
-        op->forward_inplace(c, opt);
-    }
-    else
-    {
-        op->forward(a4, c, opt);
-    }
-
-    // clang-format off
-    // *INDENT-OFF*
-#if NCNN_VFPV4
-    if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c.elembits() == 16)
-    {
-        ncnn::Mat c_fp32;
-        ncnn::cast_float16_to_float32(c, c_fp32, opt);
-        c = c_fp32;
-    }
-    else
-#endif // NCNN_VFPV4
-#if NCNN_RVV
-    if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c.elembits() == 16)
-    {
-        ncnn::Mat c_fp32;
-        ncnn::cast_float16_to_float32(c, c_fp32, opt);
-        c = c_fp32;
-    }
-    else
-#endif // NCNN_RVV
-#if NCNN_BF16
-    if (opt.use_bf16_storage && op->support_bf16_storage && c.elembits() == 16)
-    {
-        ncnn::Mat c_fp32;
-        ncnn::cast_bfloat16_to_float32(c, c_fp32, opt);
-        c = c_fp32;
-    }
-    else
-#endif // NCNN_BF16
-    if (opt.use_fp16_storage && op->support_fp16_storage && c.elembits() == 16)
-    {
-        ncnn::Mat c_fp32;
-        ncnn::cast_float16_to_float32(c, c_fp32, opt);
-        c = c_fp32;
-    }
-    // *INDENT-ON*
-    // clang-format on
-
-    op->destroy_pipeline(opt);
-
-    delete op;
-
-    return 0;
-}
-
-#if NCNN_VULKAN
-int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag)
-{
-    if (!_opt.use_packing_layout)
-    {
-        // pack1 test is useless for gpu
-        return 233;
-    }
-
-    ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex);
-    if (!op)
-    {
-        return 233;
+        return 233;
     }
 
     op->load_param(pd);
@@ -1581,3 +1461,354 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, float epsilon, void (*func)(ncnn::Layer*), int flag)
 
     return 0;
 }
+
+class TestOOMAllocator : public ncnn::UnlockedPoolAllocator
+{
+public:
+    TestOOMAllocator();
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+    ncnn::Mutex lock;
+    int counter;
+    int failid;
+};
+
+TestOOMAllocator::TestOOMAllocator()
+{
+    counter = 0;
+    failid = INT_MAX;
+}
+
+void* TestOOMAllocator::fastMalloc(size_t size)
+{
+    lock.lock();
+
+    void* ptr;
+    if (counter == failid)
+    {
+        ptr = 0;
+    }
+    else
+    {
+        ptr = ncnn::UnlockedPoolAllocator::fastMalloc(size);
+    }
+    counter++;
+
+    lock.unlock();
+
+    return ptr;
+}
+
+void TestOOMAllocator::fastFree(void* ptr)
+{
+    lock.lock();
+
+    ncnn::UnlockedPoolAllocator::fastFree(ptr);
+
+    lock.unlock();
+}
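+
+// Usage sketch for the allocator above: make allocation i fail and expect
+// the layer to back out with -100 instead of crashing. The functions below
+// first run once with failid = INT_MAX just to count allocations, then
+// replay the forward once per allocation index:
+//
+//   TestOOMAllocator oom;
+//   opt.blob_allocator = &oom;
+//   opt.workspace_allocator = &oom;
+//   oom.counter = 0;
+//   oom.failid = 3;                    // fail the 4th allocation
+//   int ret = op->forward(a4, c, opt); // expected: -100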
+
+int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, int flag)
+{
+    int typeindex = ncnn::layer_to_index(layer_type);
+    if (typeindex == -1)
+        return -1;
+
+    ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
+
+    if (!op->support_packing && _opt.use_packing_layout)
+    {
+        delete op;
+        return 233;
+    }
+    if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
+    {
+        delete op;
+        return 233;
+    }
+
+    op->load_param(pd);
+
+    if (op->one_blob_only && a.size() != 1)
+    {
+        fprintf(stderr, "layer with one_blob_only but consumes multiple inputs\n");
+        delete op;
+        return -1;
+    }
+
+    ncnn::ModelBinFromMatArray mb(weights.data());
+
+    op->load_model(mb);
+
+    ncnn::Option opt = _opt;
+    opt.num_threads = 1;
+    opt.use_vulkan_compute = false;
+
+    op->create_pipeline(opt);
+
+    if (!op->support_packing && _opt.use_packing_layout)
+    {
+        op->destroy_pipeline(opt);
+        delete op;
+        return 233;
+    }
+    if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
+    {
+        op->destroy_pipeline(opt);
+        delete op;
+        return 233;
+    }
+
+    std::vector<ncnn::Mat> a4(a.size());
+
+    for (size_t i = 0; i < a4.size(); i++)
+    {
+        convert_to_optimal_layout(a[i], a4[i], opt, op, flag);
+    }
+
+    TestOOMAllocator test_oom_allocator;
+    opt.blob_allocator = &test_oom_allocator;
+    opt.workspace_allocator = &test_oom_allocator;
+
+    std::vector<ncnn::Mat> c;
+    c.resize(top_blob_count);
+
+    if (op->support_inplace)
+    {
+        for (size_t i = 0; i < a4.size(); i++)
+        {
+            c[i] = a4[i].clone();
+        }
+
+        op->forward_inplace(c, opt);
+    }
+    else
+    {
+        op->forward(a4, c, opt);
+    }
+
+    for (int i = 0; i < top_blob_count; i++)
+    {
+        c[i].release();
+    }
+
+    const int alloc_count = test_oom_allocator.counter;
+    for (int i = 0; i < alloc_count; i++)
+    {
+        test_oom_allocator.counter = 0;
+        test_oom_allocator.failid = i;
+
+        int ret = 0;
+        if (op->support_inplace)
+        {
+            for (size_t j = 0; j < a4.size(); j++)
+            {
+                c[j] = a4[j].clone();
+            }
+
+            ret = op->forward_inplace(c, opt);
+        }
+        else
+        {
+            ret = op->forward(a4, c, opt);
+        }
+
+        for (int j = 0; j < top_blob_count; j++)
+        {
+            c[j].release();
+        }
+
+        if (ret != -100)
+        {
+            fprintf(stderr, "oom not caught %d/%d\n", i, alloc_count);
+
+            op->destroy_pipeline(opt);
+
+            delete op;
+
+            return -1;
+        }
+    }
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
+int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, int flag)
+{
+    int typeindex = ncnn::layer_to_index(layer_type);
+    if (typeindex == -1)
+        return -1;
+
+    ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
+
+    if (!op->support_packing && _opt.use_packing_layout)
+    {
+        delete op;
+        return 233;
+    }
+    if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
+    {
+        delete op;
+        return 233;
+    }
+
+    op->load_param(pd);
+
+    ncnn::ModelBinFromMatArray mb(weights.data());
+
+    op->load_model(mb);
+
+    ncnn::Option opt = _opt;
+    opt.num_threads = 1;
+    opt.use_vulkan_compute = false;
+
+    op->create_pipeline(opt);
+
+    if (!op->support_packing && _opt.use_packing_layout)
+    {
+        op->destroy_pipeline(opt);
+        delete op;
+        return 233;
+    }
+    if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic))
+    {
+        op->destroy_pipeline(opt);
+        delete op;
+        return 233;
+    }
+
+    ncnn::Mat a4;
+    convert_to_optimal_layout(a, a4, opt, op, flag);
+
+    TestOOMAllocator test_oom_allocator;
+    opt.blob_allocator = &test_oom_allocator;
+    opt.workspace_allocator = &test_oom_allocator;
+
+    ncnn::Mat c;
+
+    if (op->support_inplace)
+    {
+        c = a4.clone();
+        op->forward_inplace(c, opt);
+    }
+    else
+    {
+        op->forward(a4, c, opt);
+    }
+
+    c.release();
+
+    const int alloc_count = test_oom_allocator.counter;
+    for (int i = 0; i < alloc_count; i++)
+    {
+        test_oom_allocator.counter = 0;
+        test_oom_allocator.failid = i;
+
+        int ret = 0;
+        if (op->support_inplace)
+        {
+            c = a4.clone();
+            ret = op->forward_inplace(c, opt);
+        }
+        else
+        {
+            ret = op->forward(a4, c, opt);
+        }
+
+        c.release();
+
+        if (ret != -100)
+        {
+            fprintf(stderr, "oom not caught %d/%d\n", i, alloc_count);
+
+            op->destroy_pipeline(opt);
+
+            delete op;
+
+            return -1;
+        }
+    }
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
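+// Each row of the option table below enables one combination of packing,
+// fp16 packed/storage/arithmetic, bf16 storage, shader pack8 and image
+// storage, so the OOM recovery paths are probed under the different
+// storage layouts a layer may take.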
+int test_layer_oom(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, int flag)
+{
+    // pack fp16p fp16s fp16a bf16s shader8 image
+    const int options[][7] = {
+        {0, 0, 0, 0, 0, 0, 0},
+        {0, 0, 1, 0, 0, 0, 0},
+        {0, 0, 1, 1, 1, 0, 0},
+        {1, 0, 0, 0, 0, 0, 0},
+        {1, 1, 0, 0, 1, 0, 0},
+        {1, 0, 1, 0, 0, 1, 0},
+        {1, 1, 1, 1, 0, 0, 0},
+        {1, 1, 1, 1, 1, 1, 1},
+    };
+
+    const int opt_count = sizeof(options) / sizeof(options[0]);
+
+    for (int i = 0; i < opt_count; i++)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = options[i][0];
+        opt.use_fp16_packed = options[i][1];
+        opt.use_fp16_storage = options[i][2];
+        opt.use_fp16_arithmetic = options[i][3];
+        opt.use_bf16_storage = options[i][4];
+        opt.use_shader_pack8 = options[i][5];
+        opt.use_image_storage = options[i][6];
+
+        int ret = test_layer_oom_opt(layer_type, pd, weights, opt, a, top_blob_count, flag);
+        if (ret != 233 && ret != 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+int test_layer_oom(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, int flag)
+{
+    // pack fp16p fp16s fp16a bf16s shader8 image
+    const int options[][7] = {
+        {0, 0, 0, 0, 0, 0, 0},
+        {0, 0, 1, 0, 0, 0, 0},
+        {0, 0, 1, 1, 1, 0, 0},
+        {1, 0, 0, 0, 0, 0, 0},
+        {1, 1, 0, 0, 1, 0, 0},
+        {1, 0, 1, 0, 0, 1, 0},
+        {1, 1, 1, 1, 0, 0, 0},
+        {1, 1, 1, 1, 1, 1, 1},
+    };
+
+    const int opt_count = sizeof(options) / sizeof(options[0]);
+
+    for (int i = 0; i < opt_count; i++)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = options[i][0];
+        opt.use_fp16_packed = options[i][1];
+        opt.use_fp16_storage = options[i][2];
+        opt.use_fp16_arithmetic = options[i][3];
+        opt.use_bf16_storage = options[i][4];
+        opt.use_shader_pack8 = options[i][5];
+        opt.use_image_storage = options[i][6];
+
+        int ret = test_layer_oom_opt(layer_type, pd, weights, opt, a, flag);
+        if (ret != 233 && ret != 0)
+            return ret;
+    }
+
+    return 0;
+}
diff --git a/tests/testutil.h b/tests/testutil.h
index 12f9d0daa65..60ff4d65260 100644
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -106,4 +106,14 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0);
 int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0);
 
+// oom test
+
+int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& opt, const std::vector<ncnn::Mat>& a, int top_blob_count = 1, int flag = 0);
+
+int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& opt, const ncnn::Mat& a, int flag = 0);
+
+int test_layer_oom(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count = 1, int flag = 0);
+
+int test_layer_oom(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, int flag = 0);
+
 #endif // TESTUTIL_H