diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml index 5c44354aaaa1..d49da39a0afc 100644 --- a/.ci/pnnx.yml +++ b/.ci/pnnx.yml @@ -4,12 +4,14 @@ on: branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' mr: target-branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' concurrency: @@ -17,10 +19,10 @@ concurrency: variables: protobuf_version: 21.12 - libtorch_version: 2.3.0 - libtorchvision_version: 0.18.0 - onnxruntime_version: 1.17.3 - cache_date: 20240504 + libtorch_version: 2.4.0 + libtorchvision_version: 0.19.0 + onnxruntime_version: 1.18.1 + cache_date: 20240804 jobs: ubuntu: @@ -57,6 +59,9 @@ jobs: - torch-version: 2.3.0 torchvision-version: 0.18.0 + - torch-version: 2.4.0 + torchvision-version: 0.19.0 + runs-on: pool-name: docker container: @@ -160,7 +165,7 @@ jobs: - name: setup-pytorch run: | export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} - pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu --index-url https://download.pytorch.org/whl/cpu pip3 install --user onnx pip3 install --user onnxscript diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index d8304c0e33c0..6b6db4f0d2e9 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -87,7 +87,7 @@ jobs: # build wheels for ubuntu-20.04 - name: Build wheels for ubuntu if: matrix.os == 'ubuntu-20.04' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -99,7 +99,7 @@ jobs: # build wheels for windows-2019 - name: Build wheels for windows if: matrix.os == 'windows-2019' && (matrix.arch == 'AMD64' || matrix.arch == 'x86') - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -112,7 +112,7 @@ jobs: - name: Build wheels for windows ARM64 if: matrix.os == 'windows-2019' && matrix.arch == 'ARM64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -184,41 +184,43 @@ jobs: - name: vulkansdk for macos if: matrix.os == 'macos-13' run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + wget https://sdk.lunarg.com/sdk/download/1.3.290.0/mac/vulkansdk-macos-1.3.290.0.dmg?Human=true -O vulkansdk-macos-1.3.290.0.dmg + hdiutil attach vulkansdk-macos-1.3.290.0.dmg + sudo /Volumes/vulkansdk-macos-1.3.290.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0 --accept-licenses --default-answer --confirm-command install - name: Build wheels for macos x86_64 if: matrix.os == 'macos-13' && matrix.arch == 'x86_64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: 
CMAKE_BUILD_PARALLEL_LEVEL=3 CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC ARCHS="x86_64" - DEPLOYMENT_TARGET="10.9" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF + DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" - Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib + Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0/macOS/lib/libMoltenVK.dylib + MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse - name: Build wheels for macos arm64 if: matrix.os == 'macos-13' && matrix.arch == 'arm64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3 CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC_ARM64 ARCHS="arm64" - DEPLOYMENT_TARGET="11.0" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF + DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" - Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib + Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0/macOS/lib/libMoltenVK.dylib + MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse @@ -244,7 +246,7 @@ jobs: fail-fast: false matrix: arch: [aarch64, ppc64le, s390x] - build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312] + build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312, cp313] build_sub: [manylinux, musllinux] steps: @@ -262,7 +264,7 @@ jobs: platforms: all - name: Build wheels for manylinux with qemu - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build_cp }}-${{ matrix.build_sub }}* @@ -310,7 +312,7 @@ jobs: platforms: all - name: Build wheels for manylinux with qemu - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build_pp }}-* diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6309214e08f6..2e875fc51e73 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -53,11 +53,20 @@ jobs: name: ${{ env.PACKAGENAME }} path: /tmp/${{ env.PACKAGENAME }}.zip - ubuntu-2004: + ubuntu: needs: [setup] - runs-on: ubuntu-20.04 + strategy: + matrix: + opt: + - { shared-lib: OFF, os: ubuntu-20.04, id: ubuntu-2004 } + - { shared-lib: OFF, os: ubuntu-22.04, id: ubuntu-2204 } + - { shared-lib: OFF, os: ubuntu-24.04, id: ubuntu-2404 } + - { shared-lib: ON, os: ubuntu-20.04, id: ubuntu-2004-shared } + - { shared-lib: ON, os: ubuntu-22.04, id: ubuntu-2204-shared } + - { shared-lib: ON, os: ubuntu-24.04, id: ubuntu-2404-shared } + runs-on: ${{ matrix.opt.os }} env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004 + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} steps: - uses: actions/checkout@v4 with: @@ -69,71 +78,7 @@ jobs: run: | mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Release 
-DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2004-shared: - needs: [setup] - runs-on: ubuntu-20.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2204: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204 - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: package @@ -149,38 +94,6 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - ubuntu-2204-shared: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - openmp-macos: runs-on: macos-13 env: @@ -255,85 +168,14 @@ jobs: macos: needs: [setup, openmp-macos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: macos } + - { vulkan: ON, id: macos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_TOOLS=OFF \ - -DNCNN_BUILD_EXAMPLES=OFF \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-macos - uses: actions/download-artifact@v4 - with: - name: openmp-macos - path: openmp-macos - - name: install-openmp - run: | - sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - macos-gpu: - needs: [setup, openmp-macos] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ @@ -346,10 +188,10 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_TOOLS=OFF \ -DNCNN_BUILD_EXAMPLES=OFF \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -389,6 +231,7 @@ jobs: cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -397,12 +240,26 @@ jobs: ln -s Versions/Current/Headers glslang.framework/Headers ln -s Versions/Current/Resources glslang.framework/Resources ln -s Versions/Current/glslang glslang.framework/glslang - libtool -static build-x86_64/install/lib/libglslang.a build-x86_64/install/lib/libMachineIndependent.a build-x86_64/install/lib/libGenericCodeGen.a build-x86_64/install/lib/libSPIRV.a build-x86_64/install/lib/libOGLCompiler.a build-x86_64/install/lib/libOSDependent.a -o build-x86_64/install/lib/libglslang_combined.a - libtool -static build-arm64/install/lib/libglslang.a build-arm64/install/lib/libMachineIndependent.a build-arm64/install/lib/libGenericCodeGen.a build-arm64/install/lib/libSPIRV.a build-arm64/install/lib/libOGLCompiler.a build-arm64/install/lib/libOSDependent.a -o 
build-arm64/install/lib/libglslang_combined.a + libtool -static \ + build-x86_64/install/lib/libglslang.a \ + build-x86_64/install/lib/libMachineIndependent.a \ + build-x86_64/install/lib/libGenericCodeGen.a \ + build-x86_64/install/lib/libSPIRV.a \ + build-x86_64/install/lib/libOGLCompiler.a \ + build-x86_64/install/lib/libOSDependent.a \ + -o build-x86_64/install/lib/libglslang_combined.a + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a lipo -create build-x86_64/install/lib/libglslang_combined.a build-arm64/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -412,8 +269,16 @@ jobs: ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -485,77 +350,14 @@ jobs: ios: needs: [setup, openmp-ios] + strategy: + matrix: + opt: + - { vulkan: OFF, id: ios } + - { vulkan: ON, id: ios-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-ios - uses: actions/download-artifact@v4 - with: - name: openmp-ios - path: openmp-ios - - name: install-openmp - run: | - sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" .. - cmake --build . 
-j 4 - cmake --build . --target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-gpu: - needs: [setup, openmp-ios] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ @@ -568,8 +370,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -603,6 +405,7 @@ jobs: cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -622,7 +425,7 @@ jobs: cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -632,8 +435,16 @@ jobs: ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist 
> ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -716,9 +527,14 @@ jobs: ios-simulator: needs: [setup, openmp-ios-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: ios-simulator } + - { vulkan: ON, id: ios-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ @@ -732,89 +548,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 - - name: download-openmp-ios-simulator - uses: actions/download-artifact@v4 - with: - name: openmp-ios-simulator - path: openmp-ios-simulator - - name: install-openmp - run: | - sudo cp openmp-ios-simulator/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include - sudo cp openmp-ios-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios-simulator/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-simulator-gpu: - needs: [setup, openmp-ios-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - with: - submodules: true + with: + submodules: true - name: download-openmp-ios-simulator uses: actions/download-artifact@v4 with: @@ -849,6 +588,7 @@ jobs: cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -879,7 +619,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -892,8 +632,16 @@ jobs: build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn - cp 
-a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -976,86 +724,14 @@ jobs: mac-catalyst: needs: [setup, openmp-mac-catalyst] + strategy: + matrix: + opt: + - { vulkan: OFF, id: mac-catalyst } + - { vulkan: ON, id: mac-catalyst-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-mac-catalyst - uses: actions/download-artifact@v4 - with: - name: openmp-mac-catalyst - path: openmp-mac-catalyst - - name: install-openmp - run: | - sudo cp openmp-mac-catalyst/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-mac-catalyst/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-mac-catalyst/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - mac-catalyst-gpu: - needs: [setup, openmp-mac-catalyst] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ @@ -1068,8 +744,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -1109,6 +785,7 @@ jobs: cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1139,7 +816,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1152,8 +829,16 @@ jobs: build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > 
ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -1534,86 +1219,14 @@ jobs: tvos: needs: [setup, openmp-tvos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: tvos } + - { vulkan: ON, id: tvos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-tvos - uses: actions/download-artifact@v4 - with: - name: openmp-tvos - path: openmp-tvos - - name: install-openmp - run: | - sudo cp openmp-tvos/include/* $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/include - sudo cp openmp-tvos/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/lib - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64e - run: | - mkdir build-arm64e && cd build-arm64e - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-tvos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-arm64/install/lib/libncnn.a \ - build-arm64e/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - tvos-gpu: - needs: [setup, openmp-tvos] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ @@ -1626,8 +1239,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -1667,6 +1280,7 @@ jobs: cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1697,7 +1311,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1712,6 +1326,14 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework 
ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -1794,9 +1416,14 @@ jobs: tvos-simulator: needs: [setup, openmp-tvos-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: tvos-simulator } + - { vulkan: ON, id: tvos-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ @@ -1810,9 +1437,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-tvos-simulator uses: actions/download-artifact@v4 with: @@ -1846,87 +1476,8 @@ jobs: cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - tvos-simulator-gpu: - needs: [setup, openmp-tvos-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: download-openmp-tvos-simulator - uses: actions/download-artifact@v4 - with: - name: openmp-tvos-simulator - path: openmp-tvos-simulator - - name: install-openmp - run: | - sudo cp openmp-tvos-simulator/include/* $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/include - sudo cp openmp-tvos-simulator/lib/libomp.a 
$DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package-glslang + - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1957,7 +1508,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1972,6 +1523,14 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -2043,9 +1602,14 @@ jobs: visionos: needs: [setup, openmp-visionos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: visionos } + - { vulkan: ON, id: visionos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-visionos + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ @@ -2059,9 +1623,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-visionos uses: actions/download-artifact@v4 with: @@ -2089,7 +1656,28 @@ jobs: cp openmp-visionos/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-visionos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > 
openmp.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-glslang + if: matrix.opt.vulkan == 'ON' + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a + cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang + cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -2101,8 +1689,16 @@ jobs: cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v4 with: @@ -2183,9 +1779,14 @@ jobs: visionos-simulator: needs: [setup, openmp-visionos-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: visionos-simulator } + - { vulkan: ON, id: visionos-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ @@ -2199,9 +1800,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-visionos-simulator uses: actions/download-artifact@v4 with: @@ -2235,7 +1839,39 @@ jobs: cp openmp-visionos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-visionos-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-glslang + if: matrix.opt.vulkan == 'ON' + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources 
glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static \ + build-x86_64/install/lib/libglslang.a \ + build-x86_64/install/lib/libMachineIndependent.a \ + build-x86_64/install/lib/libGenericCodeGen.a \ + build-x86_64/install/lib/libSPIRV.a \ + build-x86_64/install/lib/libOGLCompiler.a \ + build-x86_64/install/lib/libOSDependent.a \ + -o build-x86_64/install/lib/libglslang_combined.a + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a + lipo -create \ + build-x86_64/install/lib/libglslang_combined.a \ + build-arm64/install/lib/libglslang_combined.a \ + -o glslang.framework/Versions/A/glslang + cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -2250,8 +1886,16 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v4 with: @@ -2260,51 +1904,63 @@ jobs: android: needs: [setup] + strategy: + matrix: + opt: + - { vulkan: OFF, shared-lib: OFF, id: android } + - { vulkan: OFF, shared-lib: ON, id: android-shared } + - { vulkan: ON, shared-lib: OFF, id: android-vulkan } + - { vulkan: ON, shared-lib: ON, id: android-vulkan-shared } runs-on: ubuntu-latest env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} + NCNN_CMAKE_OPTIONS: | + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake \ + -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=install \ + -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ + -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} \ + -DNCNN_AVX512BF16=OFF \ + steps: - uses: actions/checkout@v4 + with: + submodules: true - name: ndk-fix-debug run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 + - name: build-armeabi-v7a run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. 
+ mkdir build-armeabi-v7a && cd build-armeabi-v7a + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-aarch64 + - name: build-arm64-v8a run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + mkdir build-arm64-v8a && cd build-arm64-v8a + cmake ${{ env.NCNN_CMAKE_OPTIONS }}-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: build-x86 run: | mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + cmake ${{ env.NCNN_CMAKE_OPTIONS }}-DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: build-x86_64 run: | mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + cmake ${{ env.NCNN_CMAKE_OPTIONS }}-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - name: package run: | rm -rf ${{ env.PACKAGENAME }} mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a + cp -a build-armeabi-v7a/install ${{ env.PACKAGENAME }}/armeabi-v7a + cp -a build-arm64-v8a/install ${{ env.PACKAGENAME }}/arm64-v8a cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 rm -f ${{ env.PACKAGENAME }}.zip @@ -2315,55 +1971,63 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - android-shared: + webassembly: needs: [setup] runs-on: ubuntu-latest env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-shared + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly steps: - uses: actions/checkout@v4 - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 + - name: emsdk + run: | + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + ./emsdk install $EMSCRIPTEN_VERSION + ./emsdk activate $EMSCRIPTEN_VERSION + - name: build run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-aarch64 + - name: build-simd run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-simd && cd build-simd + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - - name: build-x86 + - name: build-threads run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-threads && cd build-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-x86_64 + - name: build-simd-threads run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-simd-threads && cd build-simd-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - name: package run: | rm -rf ${{ env.PACKAGENAME }} mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 + cp -a build/install ${{ env.PACKAGENAME }}/basic + cp -a build-simd/install ${{ env.PACKAGENAME }}/simd + cp -a build-threads/install ${{ env.PACKAGENAME }}/threads + cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - name: upload-zip @@ -2372,692 +2036,96 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - android-gpu: + windows: needs: [setup] - runs-on: ubuntu-latest + strategy: + matrix: + opt: + - { shared-lib: OFF, os: windows-2019, toolset-version: v140, id: vs2015 } + - { shared-lib: OFF, os: windows-2019, toolset-version: v141, id: vs2017 } + - { shared-lib: OFF, os: windows-2019, toolset-version: v142, id: vs2019 } + - { shared-lib: OFF, os: windows-2022, toolset-version: v143, id: vs2022 } + - { shared-lib: ON, os: windows-2019, toolset-version: v140, id: vs2015-shared } + - { shared-lib: ON, os: windows-2019, toolset-version: v141, id: vs2017-shared } + - { shared-lib: ON, os: windows-2019, toolset-version: v142, id: vs2019-shared } + - { shared-lib: ON, os: windows-2022, toolset-version: v143, id: vs2022-shared } + runs-on: ${{ matrix.opt.os }} env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-${{ matrix.opt.id }} + UseMultiToolTask: true + NCNN_CMAKE_OPTIONS: | + -T ${{ matrix.opt.toolset-version }},host=x64 ` + -DCMAKE_BUILD_TYPE=Release ` + -DCMAKE_INSTALL_PREFIX=install ` + -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" ` + -DNCNN_BUILD_EXAMPLES=OFF ` + -DNCNN_BUILD_TOOLS=ON ` + -DNCNN_BUILD_BENCHMARK=OFF ` + -DNCNN_VULKAN=ON ` + -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} ` + steps: - uses: actions/checkout@v4 with: submodules: true - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-aarch64 + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v4 + with: + path: "protobuf-install" + key: protobuf-${{ matrix.opt.toolset-version }}-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + cd .. + mkdir build-x64; cd build-x64; + cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install - name: build-x86 run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86_64 + mkdir build-x86; cd build-x86 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A Win32 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-x64 run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - android-gpu-shared: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: build-aarch64 - run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86 - run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - webassembly: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly - steps: - - uses: actions/checkout@v4 - - name: emsdk - run: | - git clone https://github.com/emscripten-core/emsdk.git - cd emsdk - ./emsdk install $EMSCRIPTEN_VERSION - ./emsdk activate $EMSCRIPTEN_VERSION - - name: build - run: | - source emsdk/emsdk_env.sh - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: build-simd - run: | - source emsdk/emsdk_env.sh - mkdir build-simd && cd build-simd - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-threads && cd build-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-simd-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-simd-threads && cd build-simd-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install ${{ env.PACKAGENAME }}/basic - cp -a build-simd/install ${{ env.PACKAGENAME }}/simd - cp -a build-threads/install ${{ env.PACKAGENAME }}/threads - cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install + mkdir build-x64; cd build-x64 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-arm + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' + run: | + mkdir build-arm; cd build-arm + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-arm64 + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' + run: | + mkdir build-arm64; cd build-arm64 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm64 .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install - name: package + if: matrix.opt.toolset-version == 'v140' || matrix.opt.toolset-version == 'v141' run: | mkdir ${{ env.PACKAGENAME }} mkdir ${{ env.PACKAGENAME }}/x86 mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - name: package + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' run: | mkdir ${{ env.PACKAGENAME }} mkdir ${{ env.PACKAGENAME }}/x86 @@ -3087,30 +2155,49 @@ jobs: with: path: artifacts - - name: create-xcframwork + - name: unzip run: | - mkdir -p ncnn-macos mkdir -p ncnn-ios + mkdir -p ncnn-ios-vulkan mkdir -p ncnn-ios-simulator + mkdir -p ncnn-ios-simulator-vulkan mkdir -p ncnn-mac-catalyst - mkdir -p ncnn-watchos - mkdir -p ncnn-watchos-simulator + mkdir -p ncnn-mac-catalyst-vulkan + mkdir -p ncnn-macos + mkdir -p ncnn-macos-vulkan mkdir -p ncnn-tvos + mkdir -p ncnn-tvos-vulkan mkdir -p ncnn-tvos-simulator + mkdir -p ncnn-tvos-simulator-vulkan mkdir -p ncnn-visionos + mkdir -p ncnn-visionos-vulkan mkdir -p ncnn-visionos-simulator + mkdir -p ncnn-visionos-simulator-vulkan + mkdir -p ncnn-watchos + mkdir -p ncnn-watchos-simulator - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos/ncnn-${{ needs.setup.outputs.VERSION }}-macos.zip -d ncnn-macos unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios/ncnn-${{ needs.setup.outputs.VERSION }}-ios.zip -d ncnn-ios + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan.zip -d ncnn-ios-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator.zip -d ncnn-ios-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan.zip -d ncnn-ios-simulator-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst.zip -d ncnn-mac-catalyst - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan.zip -d ncnn-mac-catalyst-vulkan + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos/ncnn-${{ needs.setup.outputs.VERSION }}-macos.zip -d ncnn-macos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan.zip -d ncnn-macos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos/ncnn-${{ needs.setup.outputs.VERSION }}-tvos.zip -d ncnn-tvos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan.zip -d ncnn-tvos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator.zip -d ncnn-tvos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan.zip -d ncnn-tvos-simulator-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos/ncnn-${{ needs.setup.outputs.VERSION }}-visionos.zip -d ncnn-visionos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan.zip -d ncnn-visionos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator/ncnn-${{ 
needs.setup.outputs.VERSION }}-visionos-simulator.zip -d ncnn-visionos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan.zip -d ncnn-visionos-simulator-vulkan + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator + - name: create-xcframwork + run: | + rm -rf openmp.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos/openmp.framework \ -framework ncnn-ios/openmp.framework \ @@ -3124,6 +2211,7 @@ jobs: -framework ncnn-visionos-simulator/openmp.framework \ -output openmp.xcframework + rm -rf ncnn.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos/ncnn.framework \ -framework ncnn-ios/ncnn.framework \ @@ -3139,48 +2227,9 @@ jobs: rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework ncnn.xcframework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - apple-gpu: - needs: [setup, macos-gpu, ios-gpu, ios-simulator-gpu, mac-catalyst-gpu, watchos, watchos-simulator, tvos-gpu, tvos-simulator-gpu, visionos, visionos-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-apple-vulkan - steps: - - run: sudo xcode-select --switch /Applications/Xcode_15.2.app - - name: download - uses: actions/download-artifact@v4 - with: - path: artifacts - - - name: create-xcframwork + - name: create-xcframwork-vulkan run: | - mkdir -p ncnn-macos-vulkan - mkdir -p ncnn-ios-vulkan - mkdir -p ncnn-ios-simulator-vulkan - mkdir -p ncnn-mac-catalyst-vulkan - mkdir -p ncnn-watchos - mkdir -p ncnn-watchos-simulator - mkdir -p ncnn-tvos-vulkan - mkdir -p ncnn-tvos-simulator-vulkan - mkdir -p ncnn-visionos - mkdir -p ncnn-visionos-simulator - - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan.zip -d ncnn-macos-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan.zip -d ncnn-ios-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan.zip -d ncnn-ios-simulator-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan.zip -d ncnn-mac-catalyst-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan.zip -d ncnn-tvos-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan.zip -d ncnn-tvos-simulator-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos/ncnn-${{ needs.setup.outputs.VERSION }}-visionos.zip -d ncnn-visionos - unzip -q artifacts/ncnn-${{ 
needs.setup.outputs.VERSION }}-visionos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator.zip -d ncnn-visionos-simulator - + rm -rf openmp.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/openmp.framework \ -framework ncnn-ios-vulkan/openmp.framework \ @@ -3194,6 +2243,7 @@ jobs: -framework ncnn-visionos-simulator/openmp.framework \ -output openmp.xcframework + rm -rf glslang.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/glslang.framework \ -framework ncnn-ios-vulkan/glslang.framework \ @@ -3201,8 +2251,11 @@ jobs: -framework ncnn-mac-catalyst-vulkan/glslang.framework \ -framework ncnn-tvos-vulkan/glslang.framework \ -framework ncnn-tvos-simulator-vulkan/glslang.framework \ + -framework ncnn-visionos-vulkan/glslang.framework \ + -framework ncnn-visionos-simulator-vulkan/glslang.framework \ -output glslang.xcframework + rm -rf ncnn.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/ncnn.framework \ -framework ncnn-ios-vulkan/ncnn.framework \ @@ -3212,22 +2265,27 @@ jobs: -framework ncnn-watchos-simulator/ncnn.framework \ -framework ncnn-tvos-vulkan/ncnn.framework \ -framework ncnn-tvos-simulator-vulkan/ncnn.framework \ - -framework ncnn-visionos/ncnn.framework \ - -framework ncnn-visionos-simulator/ncnn.framework \ + -framework ncnn-visionos-vulkan/ncnn.framework \ + -framework ncnn-visionos-simulator-vulkan/ncnn.framework \ -output ncnn.xcframework - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework glslang.xcframework ncnn.xcframework + rm -f ${{ env.PACKAGENAME }}-vulkan.zip + zip -9 -y -r ${{ env.PACKAGENAME }}-vulkan.zip openmp.xcframework glslang.xcframework ncnn.xcframework - name: upload-zip uses: actions/upload-artifact@v4 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip + - name: upload-zip-vulkan + uses: actions/upload-artifact@v4 + with: + name: ${{ env.PACKAGENAME }}-vulkan + path: ${{ env.PACKAGENAME }}-vulkan.zip release: permissions: contents: write # for softprops/action-gh-release to create a release - needs: [setup, full-source, ubuntu-2004, ubuntu-2004-shared, ubuntu-2204, ubuntu-2204-shared, macos, macos-gpu, ios, ios-gpu, ios-simulator, ios-simulator-gpu, mac-catalyst, mac-catalyst-gpu, watchos, watchos-simulator, tvos, tvos-simulator, android, android-shared, android-gpu, android-gpu-shared, webassembly, windows-vs2015, windows-vs2015-shared, windows-vs2017, windows-vs2017-shared, windows-vs2019, windows-vs2019-shared, windows-vs2022, windows-vs2022-shared, apple, apple-gpu] + needs: [setup, full-source, ubuntu, macos, ios, ios-simulator, mac-catalyst, watchos, watchos-simulator, tvos, tvos-simulator, android, webassembly, windows, apple] runs-on: ubuntu-latest steps: - name: download diff --git a/CMakeLists.txt b/CMakeLists.txt index 309e3b8fbd0a..0f32a80c86ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,11 @@ if(POLICY CMP0025) cmake_policy(SET CMP0025 NEW) endif() +if(POLICY CMP0057) + # reference from https://cmake.org/cmake/help/latest/policy/CMP0057.html + cmake_policy(SET CMP0057 NEW) +endif() + project(ncnn) if(MSVC AND NOT CMAKE_VERSION VERSION_LESS "3.15") diff --git a/README.md b/README.md index a9bb1c116fa4..146b04b1a4ed 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ https://github.com/Tencent/ncnn/releases/latest Source - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-full-source.zip) + 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-full-source.zip) @@ -97,8 +97,8 @@ https://github.com/Tencent/ncnn/releases/latest Android - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android.zip) @@ -111,8 +111,8 @@ https://github.com/Tencent/ncnn/releases/latest Android shared - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-vulkan-shared.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-vulkan-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-shared.zip) @@ -159,8 +159,8 @@ https://github.com/Tencent/ncnn/releases/latest iOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios.zip) @@ -173,8 +173,8 @@ https://github.com/Tencent/ncnn/releases/latest iOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-simulator-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-simulator.zip) @@ -193,8 +193,8 @@ https://github.com/Tencent/ncnn/releases/latest macOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-macos-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-macos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-macos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-macos.zip) @@ -207,8 +207,8 @@ https://github.com/Tencent/ncnn/releases/latest Mac-Catalyst - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-mac-catalyst-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-mac-catalyst.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-mac-catalyst-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-mac-catalyst.zip) @@ -221,7 +221,7 @@ https://github.com/Tencent/ncnn/releases/latest watchOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-watchos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-watchos.zip) @@ -234,7 +234,7 @@ https://github.com/Tencent/ncnn/releases/latest watchOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-watchos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-watchos-simulator.zip) @@ -242,8 +242,8 @@ https://github.com/Tencent/ncnn/releases/latest tvOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-vulkan.zip) - 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos.zip) @@ -256,8 +256,8 @@ https://github.com/Tencent/ncnn/releases/latest tvOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-simulator-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-simulator.zip) @@ -265,7 +265,8 @@ https://github.com/Tencent/ncnn/releases/latest visionOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-visionos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos.zip) @@ -278,7 +279,8 @@ https://github.com/Tencent/ncnn/releases/latest visionOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-visionos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-simulator.zip) @@ -286,8 +288,8 @@ https://github.com/Tencent/ncnn/releases/latest Apple xcframework - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-apple-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-apple.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-apple-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-apple.zip) @@ -296,10 +298,10 @@ https://github.com/Tencent/ncnn/releases/latest - + - + - [Build for Linux / NVIDIA Jetson / Raspberry Pi3, Pi4 / POWER](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux) @@ -309,11 +311,11 @@ https://github.com/Tencent/ncnn/releases/latest Ubuntu 20.04 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2004.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2004-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2004.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2004-shared.zip) - + [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-x64-gpu-gcc) @@ -323,8 +325,17 @@ https://github.com/Tencent/ncnn/releases/latest Ubuntu 22.04 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2204.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2204-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2204.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2204-shared.zip) + + + + +Ubuntu 24.04 + + + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2404.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2404-shared.zip) @@ -344,8 +355,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2015 - 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2015.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2015-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2015.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2015-shared.zip) @@ -358,8 +369,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2017 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2017.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2017-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2017.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2017-shared.zip) @@ -367,8 +378,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2019 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2019.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2019-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2019.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2019-shared.zip) @@ -376,8 +387,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2022 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2022.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2022-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2022.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2022-shared.zip) @@ -396,7 +407,7 @@ https://github.com/Tencent/ncnn/releases/latest WebAssembly - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-webassembly.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-webassembly.zip) @@ -560,7 +571,7 @@ https://github.com/Tencent/ncnn/releases/latest **[use netron for ncnn model visualization](https://netron.app)** -**[out-of-the-box web model conversion](https://convertmodel.com/#outputFormat=ncnn)** +**[use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx)** [ncnn low-level operation api](https://github.com/Tencent/ncnn/wiki/low-level-operation-api) diff --git a/benchmark/README.md b/benchmark/README.md index 1927acf81cd4..df9e55de4a8e 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -5911,6 +5911,298 @@ cooling_down = 0 FastestDet min = 5.13 max = 5.47 avg = 5.30 ``` +### HUAWEI Kunpeng 920 7260 (x64 cores) +test on Ubuntu 20.04 (gcc 9.4.0) +``` +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 1 0 -1 0 +loop_count = 300 +num_threads = 1 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 11.64 max = 12.11 avg = 11.71 + squeezenet_int8 min = 12.22 max = 13.22 avg = 12.37 + mobilenet min = 20.00 max = 20.79 avg = 20.08 + mobilenet_int8 min = 17.44 max = 19.09 avg = 17.64 + mobilenet_v2 min = 13.29 max = 14.25 avg = 13.39 + mobilenet_v3 min = 11.06 max = 11.84 avg = 11.11 + shufflenet min = 7.56 max = 7.74 avg = 7.59 + shufflenet_v2 min = 7.84 max = 8.37 avg = 7.88 + mnasnet min = 13.07 max = 13.78 avg = 13.14 + proxylessnasnet min = 15.71 max = 16.31 avg = 15.77 + efficientnet_b0 min = 34.79 max = 35.98 avg = 34.92 + 
efficientnetv2_b0 min = 35.28 max = 36.36 avg = 35.41 + regnety_400m min = 17.06 max = 17.74 avg = 17.16 + blazeface min = 2.99 max = 3.04 avg = 3.01 + googlenet min = 50.76 max = 51.74 avg = 51.00 + googlenet_int8 min = 50.31 max = 52.27 avg = 50.65 + resnet18 min = 34.97 max = 37.17 avg = 35.82 + resnet18_int8 min = 40.47 max = 42.03 avg = 40.78 + alexnet min = 39.19 max = 39.80 avg = 39.32 + vgg16 min = 176.62 max = 181.29 avg = 177.07 + vgg16_int8 min = 352.35 max = 358.38 avg = 355.15 + resnet50 min = 96.76 max = 98.63 avg = 97.09 + resnet50_int8 min = 90.00 max = 92.74 avg = 90.81 + squeezenet_ssd min = 33.23 max = 33.99 avg = 33.39 + squeezenet_ssd_int8 min = 38.50 max = 41.53 avg = 39.28 + mobilenet_ssd min = 42.49 max = 44.78 avg = 42.72 + mobilenet_ssd_int8 min = 37.06 max = 39.97 avg = 37.57 + mobilenet_yolo min = 96.34 max = 98.91 avg = 96.73 + mobilenetv2_yolov3 min = 50.88 max = 52.97 avg = 51.15 + yolov4-tiny min = 65.56 max = 67.13 avg = 65.80 + nanodet_m min = 19.94 max = 20.82 avg = 20.04 + yolo-fastest-1.1 min = 7.66 max = 7.81 avg = 7.71 + yolo-fastestv2 min = 6.82 max = 7.23 avg = 6.87 + vision_transformer min = 1535.03 max = 1552.84 avg = 1543.73 + FastestDet min = 7.17 max = 7.50 avg = 7.21 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 2 0 -1 0 +loop_count = 300 +num_threads = 2 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 6.35 max = 9.15 avg = 7.33 + squeezenet_int8 min = 8.06 max = 8.60 avg = 8.14 + mobilenet min = 10.30 max = 11.86 avg = 11.48 + mobilenet_int8 min = 8.93 max = 11.87 avg = 10.47 + mobilenet_v2 min = 9.05 max = 11.50 avg = 9.19 + mobilenet_v3 min = 6.32 max = 6.42 avg = 6.36 + shufflenet min = 6.73 max = 8.55 avg = 6.81 + shufflenet_v2 min = 4.94 max = 6.65 avg = 6.32 + mnasnet min = 7.38 max = 10.77 avg = 8.82 + proxylessnasnet min = 8.57 max = 9.72 avg = 8.63 + efficientnet_b0 min = 18.61 max = 22.53 avg = 20.42 + efficientnetv2_b0 min = 18.75 max = 21.93 avg = 20.79 + regnety_400m min = 11.86 max = 15.09 avg = 14.60 + blazeface min = 1.95 max = 3.37 avg = 2.06 + googlenet min = 28.66 max = 32.24 avg = 28.94 + googlenet_int8 min = 27.64 max = 32.15 avg = 30.84 + resnet18 min = 20.33 max = 20.77 avg = 20.47 + resnet18_int8 min = 22.63 max = 23.72 avg = 22.88 + alexnet min = 20.41 max = 29.37 avg = 27.22 + vgg16 min = 101.72 max = 140.33 avg = 103.29 + vgg16_int8 min = 187.56 max = 211.44 avg = 189.92 + resnet50 min = 51.07 max = 59.25 avg = 58.35 + resnet50_int8 min = 46.50 max = 52.55 avg = 48.93 + squeezenet_ssd min = 22.48 max = 28.59 avg = 22.98 + squeezenet_ssd_int8 min = 25.56 max = 26.82 avg = 25.99 + mobilenet_ssd min = 22.81 max = 26.21 avg = 24.88 + mobilenet_ssd_int8 min = 19.31 max = 25.53 avg = 21.74 + mobilenet_yolo min = 59.58 max = 62.04 avg = 59.99 + mobilenetv2_yolov3 min = 33.26 max = 35.74 avg = 33.51 + yolov4-tiny min = 41.14 max = 45.34 avg = 42.46 + nanodet_m min = 12.10 max = 16.69 avg = 15.02 + yolo-fastest-1.1 min = 5.44 max = 7.78 avg = 7.24 + yolo-fastestv2 min = 5.03 max = 8.08 avg = 6.75 + vision_transformer min = 994.46 max = 1090.68 avg = 1045.50 + FastestDet min = 6.76 max = 6.91 avg = 6.83 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 4 0 -1 0 +loop_count = 300 +num_threads = 4 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 3.79 max = 6.99 avg = 4.55 + squeezenet_int8 min = 5.13 max = 5.68 avg = 5.20 + mobilenet min = 6.25 max = 6.55 avg = 6.30 + mobilenet_int8 min = 5.96 max = 6.10 avg = 
6.03 + mobilenet_v2 min = 5.34 max = 7.15 avg = 5.62 + mobilenet_v3 min = 4.05 max = 5.74 avg = 5.01 + shufflenet min = 3.69 max = 5.81 avg = 5.15 + shufflenet_v2 min = 4.31 max = 6.02 avg = 4.56 + mnasnet min = 4.48 max = 6.05 avg = 5.54 + proxylessnasnet min = 5.05 max = 8.08 avg = 6.03 + efficientnet_b0 min = 10.17 max = 12.21 avg = 11.58 + efficientnetv2_b0 min = 10.86 max = 15.78 avg = 12.70 + regnety_400m min = 9.24 max = 14.13 avg = 11.98 + blazeface min = 1.89 max = 1.97 avg = 1.93 + googlenet min = 15.19 max = 20.31 avg = 16.90 + googlenet_int8 min = 17.97 max = 19.40 avg = 18.11 + resnet18 min = 11.18 max = 11.48 avg = 11.29 + resnet18_int8 min = 12.26 max = 12.78 avg = 12.44 + alexnet min = 14.43 max = 16.94 avg = 14.68 + vgg16 min = 62.40 max = 78.42 avg = 64.96 + vgg16_int8 min = 101.52 max = 109.42 avg = 104.46 + resnet50 min = 29.19 max = 39.69 avg = 32.99 + resnet50_int8 min = 26.94 max = 28.82 avg = 27.16 + squeezenet_ssd min = 12.90 max = 16.52 avg = 15.20 + squeezenet_ssd_int8 min = 15.58 max = 18.40 avg = 16.28 + mobilenet_ssd min = 13.68 max = 14.45 avg = 13.87 + mobilenet_ssd_int8 min = 12.20 max = 14.58 avg = 12.84 + mobilenet_yolo min = 34.85 max = 36.54 avg = 35.05 + mobilenetv2_yolov3 min = 18.61 max = 20.93 avg = 19.92 + yolov4-tiny min = 26.09 max = 32.32 avg = 28.03 + nanodet_m min = 7.85 max = 12.48 avg = 11.00 + yolo-fastest-1.1 min = 6.19 max = 6.49 avg = 6.31 + yolo-fastestv2 min = 3.66 max = 6.83 avg = 5.11 + vision_transformer min = 605.95 max = 624.99 avg = 609.79 + FastestDet min = 4.32 max = 5.41 avg = 5.17 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 8 0 -1 0 +loop_count = 300 +num_threads = 8 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.72 max = 3.74 avg = 3.05 + squeezenet_int8 min = 3.80 max = 4.71 avg = 4.03 + mobilenet min = 3.94 max = 5.15 avg = 4.00 + mobilenet_int8 min = 3.73 max = 3.87 avg = 3.80 + mobilenet_v2 min = 4.51 max = 6.57 avg = 4.68 + mobilenet_v3 min = 4.12 max = 4.38 avg = 4.28 + shufflenet min = 4.60 max = 6.27 avg = 4.88 + shufflenet_v2 min = 4.07 max = 4.20 avg = 4.11 + mnasnet min = 4.26 max = 4.51 avg = 4.36 + proxylessnasnet min = 4.71 max = 7.40 avg = 4.80 + efficientnet_b0 min = 8.49 max = 8.74 avg = 8.56 + efficientnetv2_b0 min = 9.34 max = 9.68 avg = 9.41 + regnety_400m min = 8.00 max = 12.85 avg = 10.64 + blazeface min = 1.76 max = 1.84 avg = 1.80 + googlenet min = 10.89 max = 11.33 avg = 10.98 + googlenet_int8 min = 11.66 max = 14.07 avg = 11.83 + resnet18 min = 6.48 max = 6.61 avg = 6.54 + resnet18_int8 min = 7.30 max = 7.79 avg = 7.51 + alexnet min = 8.33 max = 8.95 avg = 8.62 + vgg16 min = 29.94 max = 47.54 avg = 31.95 + vgg16_int8 min = 54.67 max = 60.76 avg = 56.03 + resnet50 min = 16.13 max = 20.79 avg = 20.03 + resnet50_int8 min = 15.64 max = 20.13 avg = 16.11 + squeezenet_ssd min = 11.58 max = 12.02 avg = 11.77 + squeezenet_ssd_int8 min = 11.14 max = 13.72 avg = 12.10 + mobilenet_ssd min = 8.27 max = 10.77 avg = 8.76 + mobilenet_ssd_int8 min = 8.13 max = 9.09 avg = 8.29 + mobilenet_yolo min = 23.90 max = 24.69 avg = 24.17 + mobilenetv2_yolov3 min = 14.83 max = 15.72 avg = 15.19 + yolov4-tiny min = 19.78 max = 23.66 avg = 20.05 + nanodet_m min = 8.92 max = 10.76 avg = 9.09 + yolo-fastest-1.1 min = 5.49 max = 5.77 avg = 5.63 + yolo-fastestv2 min = 5.04 max = 5.21 avg = 5.10 + vision_transformer min = 318.42 max = 379.40 avg = 363.66 + FastestDet min = 4.18 max = 4.54 avg = 4.38 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# 
../build/benchmark/benchncnn 300 16 0 -1 0 +loop_count = 300 +num_threads = 16 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.70 max = 3.14 avg = 2.81 + squeezenet_int8 min = 3.21 max = 4.22 avg = 3.39 + mobilenet min = 3.13 max = 3.26 avg = 3.20 + mobilenet_int8 min = 3.17 max = 5.05 avg = 3.30 + mobilenet_v2 min = 4.31 max = 6.24 avg = 4.62 + mobilenet_v3 min = 3.57 max = 3.77 avg = 3.68 + shufflenet min = 4.70 max = 6.45 avg = 4.80 + shufflenet_v2 min = 3.73 max = 4.27 avg = 3.87 + mnasnet min = 3.67 max = 3.87 avg = 3.75 + proxylessnasnet min = 4.28 max = 4.81 avg = 4.35 + efficientnet_b0 min = 7.31 max = 7.77 avg = 7.53 + efficientnetv2_b0 min = 9.87 max = 12.33 avg = 10.07 + regnety_400m min = 17.95 max = 18.53 avg = 18.26 + blazeface min = 2.26 max = 2.40 avg = 2.33 + googlenet min = 9.51 max = 9.99 avg = 9.68 + googlenet_int8 min = 10.98 max = 11.36 avg = 11.18 + resnet18 min = 5.59 max = 6.08 avg = 5.71 + resnet18_int8 min = 6.55 max = 7.28 avg = 6.77 + alexnet min = 6.26 max = 6.50 avg = 6.36 + vgg16 min = 23.98 max = 27.37 avg = 24.89 + vgg16_int8 min = 38.07 max = 39.66 avg = 39.02 + resnet50 min = 12.81 max = 14.19 avg = 13.76 + resnet50_int8 min = 12.42 max = 12.84 avg = 12.55 + squeezenet_ssd min = 10.80 max = 11.49 avg = 11.12 + squeezenet_ssd_int8 min = 11.57 max = 12.21 avg = 11.74 + mobilenet_ssd min = 7.46 max = 8.08 avg = 7.84 + mobilenet_ssd_int8 min = 7.47 max = 8.07 avg = 7.63 + mobilenet_yolo min = 21.70 max = 23.43 avg = 21.92 + mobilenetv2_yolov3 min = 12.55 max = 14.56 avg = 12.90 + yolov4-tiny min = 17.68 max = 19.85 avg = 18.18 + nanodet_m min = 8.35 max = 8.70 avg = 8.45 + yolo-fastest-1.1 min = 5.70 max = 7.11 avg = 6.05 + yolo-fastestv2 min = 4.85 max = 5.70 avg = 5.37 + vision_transformer min = 214.36 max = 259.56 avg = 245.47 + FastestDet min = 5.01 max = 5.42 avg = 5.17 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 32 0 -1 0 +loop_count = 300 +num_threads = 32 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.30 max = 2.94 avg = 2.46 + squeezenet_int8 min = 3.08 max = 4.88 avg = 4.03 + mobilenet min = 2.49 max = 2.76 avg = 2.53 + mobilenet_int8 min = 2.86 max = 3.73 avg = 2.95 + mobilenet_v2 min = 4.51 max = 5.20 avg = 4.74 + mobilenet_v3 min = 5.11 max = 6.91 avg = 6.10 + shufflenet min = 5.57 max = 6.51 avg = 5.78 + shufflenet_v2 min = 4.37 max = 4.66 avg = 4.48 + mnasnet min = 3.72 max = 4.08 avg = 3.90 + proxylessnasnet min = 4.19 max = 6.18 avg = 4.79 + efficientnet_b0 min = 6.80 max = 7.22 avg = 6.89 + efficientnetv2_b0 min = 13.98 max = 17.55 avg = 15.06 + regnety_400m min = 16.10 max = 16.72 avg = 16.26 + blazeface min = 2.12 max = 2.53 avg = 2.17 + googlenet min = 8.63 max = 9.89 avg = 8.77 + googlenet_int8 min = 9.90 max = 11.09 avg = 10.08 + resnet18 min = 6.54 max = 6.99 avg = 6.73 + resnet18_int8 min = 8.34 max = 9.00 avg = 8.67 + alexnet min = 6.64 max = 7.15 avg = 6.93 + vgg16 min = 22.79 max = 23.91 avg = 23.50 + vgg16_int8 min = 32.37 max = 37.51 avg = 33.13 + resnet50 min = 11.19 max = 16.40 avg = 11.47 + resnet50_int8 min = 11.92 max = 12.55 avg = 12.13 + squeezenet_ssd min = 10.75 max = 12.28 avg = 11.12 + squeezenet_ssd_int8 min = 11.31 max = 12.29 avg = 11.57 + mobilenet_ssd min = 10.25 max = 11.26 avg = 10.79 + mobilenet_ssd_int8 min = 11.39 max = 16.99 avg = 11.98 + mobilenet_yolo min = 52.11 max = 60.46 avg = 53.84 + mobilenetv2_yolov3 min = 12.07 max = 12.47 avg = 12.20 + yolov4-tiny min = 17.48 max = 17.79 avg = 17.58 + nanodet_m min = 13.06 
max = 14.71 avg = 13.64 + yolo-fastest-1.1 min = 5.70 max = 5.89 avg = 5.79 + yolo-fastestv2 min = 8.89 max = 9.99 avg = 9.21 + vision_transformer min = 158.92 max = 187.40 avg = 168.21 + FastestDet min = 8.70 max = 9.43 avg = 9.00 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 64 0 -1 0 +loop_count = 300 +num_threads = 64 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 6.85 max = 78.56 avg = 7.81 + squeezenet_int8 min = 8.06 max = 88.91 avg = 9.23 + mobilenet min = 3.02 max = 86.86 avg = 5.89 + mobilenet_int8 min = 3.58 max = 4.55 avg = 3.68 + mobilenet_v2 min = 5.05 max = 150.06 avg = 13.04 + mobilenet_v3 min = 4.85 max = 125.22 avg = 8.34 + shufflenet min = 17.80 max = 220.55 avg = 21.01 + shufflenet_v2 min = 11.23 max = 381.95 avg = 13.71 + mnasnet min = 9.83 max = 128.42 avg = 11.10 + proxylessnasnet min = 10.53 max = 68.52 avg = 12.03 + efficientnet_b0 min = 16.78 max = 968.87 avg = 23.94 + efficientnetv2_b0 min = 26.23 max = 551.18 avg = 31.34 + regnety_400m min = 70.14 max = 407.92 avg = 78.30 + blazeface min = 7.27 max = 191.44 avg = 9.37 + googlenet min = 16.69 max = 820.58 avg = 25.06 + googlenet_int8 min = 20.58 max = 849.09 avg = 29.87 + resnet18 min = 8.67 max = 349.00 avg = 11.33 + resnet18_int8 min = 10.40 max = 128.98 avg = 11.45 + alexnet min = 6.15 max = 196.01 avg = 10.24 + vgg16 min = 21.11 max = 288.66 avg = 29.37 + vgg16_int8 min = 30.72 max = 251.95 avg = 37.68 + resnet50 min = 19.10 max = 114.08 avg = 22.00 + resnet50_int8 min = 18.99 max = 436.89 avg = 24.36 + squeezenet_ssd min = 22.22 max = 510.52 avg = 28.76 + squeezenet_ssd_int8 min = 23.42 max = 614.70 avg = 30.82 + mobilenet_ssd min = 7.62 max = 202.66 avg = 14.59 + mobilenet_ssd_int8 min = 7.89 max = 109.82 avg = 8.80 + mobilenet_yolo min = 31.43 max = 742.10 avg = 45.52 + mobilenetv2_yolov3 min = 18.31 max = 273.05 avg = 20.78 + yolov4-tiny min = 21.03 max = 400.05 avg = 33.64 + nanodet_m min = 19.94 max = 114.18 avg = 21.89 + yolo-fastest-1.1 min = 7.20 max = 174.60 avg = 9.13 + yolo-fastestv2 min = 7.50 max = 170.55 avg = 9.01 + vision_transformer min = 126.90 max = 335.71 avg = 157.38 + FastestDet min = 6.59 max = 19.77 avg = 6.77 +``` + ### Intel Atom x5-Z8350 ``` nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 20 4 0 -1 1 diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 05996f8d7354..de4d6b428e99 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -71,6 +71,7 @@ * [Reorg](#reorg) * [Requantize](#requantize) * [Reshape](#reshape) +* [RMSNorm](#rmsnorm) * [RNN](#rnn) * [Scale](#scale) * [SELU](#selu) @@ -836,11 +837,13 @@ y = embedding(x) | 1 | input_dim | int | 0 | | | 2 | bias_term | int | 0 | | | 3 | weight_data_size | int | 0 | | +| 18 | int8_scale_term| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float | [weight_data_size] | | bias_term | float | [num_output] | +| weight_data_int8_scales| float | [1] | # Exp ``` @@ -1670,6 +1673,26 @@ Reshape flag: - -1 = remaining - -233 = drop this dim(default) +# RMSNorm +``` +split x along outmost axis into part x0, x1 ... +root mean square normalize for each part x0, x1 ... 
+y = x * gamma elementwise
+```
+
+* one_blob_only
+* support_inplace
+
+| param id | name | type | default | description |
+| --------- | ------------- | ----- | --------- | ----------------- |
+| 0 | affine_size | int | 0 | |
+| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) |
+| 2 | affine | int | 1 | |
+
+| weight | type | shape |
+| ------------- | ----- | --------------------- |
+| gamma_data | float | [affine_size] |
+
 # RNN
 Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`.
diff --git a/docs/faq.en.md b/docs/faq.en.md
index 807c4a9e3ee6..44d0068263b6 100644
--- a/docs/faq.en.md
+++ b/docs/faq.en.md
@@ -262,7 +262,7 @@ Fully customizable op, first change to one that can export (e.g. concat slice),
 Set net.opt.use_vulkan_compute = true before load_param / load_model;
 
-- ## How to ececute multiple blob inputs, multiple blob outputs?
+- ## How to execute multiple blob inputs, multiple blob outputs?
 Multiple execute `ex.input()` and `ex.extract()` like following
 ```
 ex.input("data1", in_1);
diff --git a/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md
index 9b0559a8eb8f..e0195aa1403c 100644
--- a/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md
+++ b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md
@@ -2,8 +2,114 @@ Here is a practical guide for converting pytorch model to ncnn
 resnet18 is used as the example
-## pytorch to onnx
-
+## pytorch to ncnn, onnx to ncnn
+
+### What is pnnx?
+PyTorch Neural Network eXchange (PNNX) is an open standard for PyTorch model interoperability. PNNX provides an open model format for PyTorch. It defines a computation graph as well as high-level operators that strictly match PyTorch.
+It is now recommended to use the `pnnx` tool to convert your `onnx` or `pytorch` model into an ncnn model.
+
+### How to install pnnx?
+* A. python pip (recommended)
+  * Windows/Linux/macOS 64bit
+  * python 3.7 or later
+
+  ```shell
+  pip3 install pnnx
+  ```
+
+* B. portable binary package (recommended if you hate python)
+  * Windows/Linux/macOS 64bit
+  * For Linux, glibc 2.17+
+
+  Download the portable pnnx binary package from https://github.com/pnnx/pnnx/releases and extract it.
+
+* C. build from source
+  1. install pytorch
+  2. (optional) install torchvision for pnnx torchvision operator support
+  3. (optional) install protobuf for pnnx onnx-zero support
+  4. clone https://github.com/Tencent/ncnn.git
+  5. build pnnx in ncnn/tools/pnnx with cmake
+
+  You can refer to https://github.com/pnnx/pnnx/blob/main/.github/workflows/release.yml for detailed steps
+
+  ```shell
+  git clone https://github.com/Tencent/ncnn.git
+  mkdir ncnn/tools/pnnx/build
+  cd ncnn/tools/pnnx/build
+  cmake -DCMAKE_INSTALL_PREFIX=install -DTorch_INSTALL_DIR= -DTorchVision_INSTALL_DIR= ..
+  cmake --build . --config Release -j 4
+  cmake --build . --config Release --target install
+  ```
+
+### How to use pnnx?
+* A. python
+  1. optimize and export your torch model with pnnx.export()
+  ```python
+  import torch
+  import torchvision.models as models
+  import pnnx
+
+  model = models.resnet18(pretrained=True)
+
+  x = torch.rand(1, 3, 224, 224)
+
+  opt_model = pnnx.export(model, "resnet18.pt", x)
+
+  # use a tuple for models with multiple inputs
+  # opt_model = pnnx.export(model, "resnet18.pt", (x, y, z))
+  ```
+  2. use the optimized module just like the normal one
+  ```python
+  result = opt_model(x)
+  ```
+  3.
pick resnet18_pnnx.py for the pnnx-optimized torch model
+  4. pick resnet18.ncnn.param and resnet18.ncnn.bin for ncnn inference (see the minimal loading sketch below)
+
+* B. command line
+  1. export your torch model to torchscript / onnx
+  ```python
+  import torch
+  import torchvision.models as models
+
+  net = models.resnet18(pretrained=True)
+  net = net.eval()
+
+  x = torch.rand(1, 3, 224, 224)
+
+  # You could try disabling checking when tracing raises an error
+  # mod = torch.jit.trace(net, x, check_trace=False)
+  mod = torch.jit.trace(net, x)
+
+  mod.save("resnet18.pt")
+
+  # You could also try exporting to the good old onnx
+  torch.onnx.export(net, x, 'resnet18.onnx')
+  ```
+
+  2. use pnnx to convert the torchscript / onnx file to optimized pnnx model and ncnn model files
+  ```shell
+  ./pnnx resnet18.pt inputshape=[1,3,224,224]
+  ./pnnx resnet18.onnx inputshape=[1,3,224,224]
+  ```
+  macOS zsh users may need double quotes to prevent ambiguity
+  ```shell
+  ./pnnx resnet18.pt "inputshape=[1,3,224,224]"
+  ```
+  For models with multiple inputs, use a list
+  ```shell
+  ./pnnx resnet18.pt inputshape=[1,3,224,224],[1,32]
+  ```
+  For models with non-fp32 input data types, add a type suffix
+  ```shell
+  ./pnnx resnet18.pt inputshape=[1,3,224,224]f32,[1,32]i64
+  ```
+  3. pick resnet18_pnnx.py for the pnnx-optimized torch model
+  4. pick resnet18.ncnn.param and resnet18.ncnn.bin for ncnn inference
+
+see more pnnx information: https://github.com/pnnx/pnnx
+
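+### How to run the converted ncnn model?
+The exported resnet18.ncnn.param / resnet18.ncnn.bin pair is loaded with the regular ncnn runtime. The snippet below is only a minimal illustrative sketch using the `ncnn` python wheel; it assumes the pnnx-generated blob names `in0` and `out0` (check your resnet18.ncnn.param if they differ) and it skips real image preprocessing.
+
+```python
+import ncnn
+
+net = ncnn.Net()
+net.load_param("resnet18.ncnn.param")
+net.load_model("resnet18.ncnn.bin")
+
+# 224x224 3-channel input blob; fill it with a normalized RGB image in practice
+in0 = ncnn.Mat((224, 224, 3))
+
+ex = net.create_extractor()
+ex.input("in0", in0)
+ret, out0 = ex.extract("out0")
+print(ret, out0.w)  # resnet18 produces 1000 class scores
+```
+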
+## pytorch to onnx (deprecated)
+
 The official pytorch tutorial for exporting onnx model
 
 https://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html
@@ -22,9 +128,10 @@ x = torch.rand(1, 3, 224, 224)
 
 # Export the model
 torch_out = torch.onnx._export(model, x, "resnet18.onnx", export_params=True)
 ```
+
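+A quick optional sanity check of the exported file before converting it. This sketch is not part of the original guide and assumes the `onnx` pip package is installed:
+
+```python
+import onnx
+
+model = onnx.load("resnet18.onnx")
+onnx.checker.check_model(model)  # raises if the exported graph is structurally invalid
+print(set(node.op_type for node in model.graph.node))  # peek at the exported operators
+```
+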
-## simplify onnx model - +## simplify onnx model (deprecated) +
simplify onnx model The exported resnet18.onnx model may contains many redundant operators such as Shape, Gather and Unsqueeze that is not supported in ncnn ``` @@ -37,19 +144,36 @@ Unsqueeze not supported yet! # axes 7 ``` -Fortunately, daquexian developed a handy tool to eliminate them. cheers! +### onnxsim -https://github.com/daquexian/onnx-simplifier +Fortunately, [@daquexian](https://github.com/daquexian) developed a handy tool to eliminate them. cheers! +#### how to use onnxsim? +```shell +pip install onnxsim +python -m onnxsim resnet18.onnx resnet18-sim.onnx ``` -python3 -m onnxsim resnet18.onnx resnet18-sim.onnx -``` +more informations: https://github.com/daquexian/onnx-simplifier -## onnx to ncnn +### onnxslim -Finally, you can convert the model to ncnn using tools/onnx2ncnn +Or you can use another powerful model simplification tool implemented in pure Python development by [@inisis](https://github.com/inisis): +#### how to use onnxslim? +```shell +pip install onnxslim +python -m onnxslim resnet18.onnx resnet18-slim.onnx ``` -onnx2ncnn resnet18-sim.onnx resnet18.param resnet18.bin -``` +more informations: https://github.com/inisis/OnnxSlim +
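+After simplification you can optionally check that the simplified model still produces the same outputs as the original one. This is only an illustrative sketch and assumes the `onnxruntime` pip package; it is not required by the conversion flow:
+
+```python
+import numpy as np
+import onnxruntime as ort
+
+x = np.random.rand(1, 3, 224, 224).astype(np.float32)
+
+ref = ort.InferenceSession("resnet18.onnx")
+sim = ort.InferenceSession("resnet18-sim.onnx")
+
+y_ref = ref.run(None, {ref.get_inputs()[0].name: x})[0]
+y_sim = sim.run(None, {sim.get_inputs()[0].name: x})[0]
+print(np.abs(y_ref - y_sim).max())  # expect a value close to 0
+```
+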
+
+## onnx2ncnn (deprecated)
+
+~~The onnx2ncnn tool is no longer maintained. It is recommended to use the PNNX tool instead.~~
+
+~~Finally, you can convert the model to ncnn using tools/onnx2ncnn~~
+
+~~onnx2ncnn resnet18-sim.onnx resnet18.param resnet18.bin~~
+
\ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a7739be27e51..bf3017dbe680 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -69,6 +69,7 @@ if(NCNN_PIXEL) ncnn_add_example(yolov4) ncnn_add_example(rvm) ncnn_add_example(p2pnet) + ncnn_add_example(yolov8) endif() else() message(WARNING "OpenCV not found and NCNN_SIMPLEOCV disabled, examples won't be built") diff --git a/examples/yolov8.cpp b/examples/yolov8.cpp new file mode 100644 index 000000000000..e166e6c1d174 --- /dev/null +++ b/examples/yolov8.cpp @@ -0,0 +1,410 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Copyright (C) 2024 whyb(https://github.com/whyb). All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +// ReadMe +// Convert yolov8 model to ncnn model workflow: +// +// step 1: +// If you don't want to train the model yourself. You should go to the ultralytics website download the pretrained model file. +// original pretrained model from https://docs.ultralytics.com/models/yolov8/#supported-tasks-and-modes +// +// step 2: +// run this command. +// conda create --name yolov8 python=3.11 +// conda activate yolov8 +// pip install ultralytics onnx numpy protobuf +// +// step 3: +// save source code file(export_model_to_ncnn.py): +// from ultralytics import YOLO +// detection_models = [ +// ["./Detection-pt/yolov8n.pt", "./Detection-pt/"], +// ["./Detection-pt/yolov8s.pt", "./Detection-pt/"], +// ["./Detection-pt/yolov8m.pt", "./Detection-pt/"], +// ["./Detection-pt/yolov8l.pt", "./Detection-pt/"], +// ["./Detection-pt/yolov8x.pt", "./Detection-pt/"] +// ] +// for model_dict in detection_models: +// model = YOLO(model_dict[0]) # load an official pretrained weight model +// model.export(format="ncnn", dynamic=True, save_dir=model_dict[1], simplify=True) +// +// step 4: +// run command: python export_model_to_ncnn.py + +#include +#include +#include +#include "layer.h" +#include "net.h" + +#include +#include +#include +#include +#include + +#define MAX_STRIDE 32 + +struct Object +{ + cv::Rect_ rect; + int label; + float prob; +}; + +static inline float intersection_area(const Object& a, const Object& b) +{ + cv::Rect_ inter = a.rect & b.rect; + return inter.area(); +} + +static void qsort_descent_inplace(std::vector& objects, int left, int right) +{ + int i = left; + int j = right; + float p = objects[(left + right) / 2].prob; + + while (i <= j) + { + while (objects[i].prob > p) + i++; + + while (objects[j].prob < p) + j--; + + if (i <= j) + { + // swap + std::swap(objects[i], objects[j]); + + i++; + j--; + } + } + + #pragma omp parallel sections + { + #pragma omp section + { + if (left < j) qsort_descent_inplace(objects, left, j); + } + #pragma omp section + { + if (i < right) qsort_descent_inplace(objects, i, right); + } + } +} + +static void qsort_descent_inplace(std::vector& objects) +{ + if (objects.empty()) + return; + 
+ qsort_descent_inplace(objects, 0, objects.size() - 1); +} + +static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) +{ + picked.clear(); + + const int n = faceobjects.size(); + + std::vector areas(n); + for (int i = 0; i < n; i++) + { + areas[i] = faceobjects[i].rect.area(); + } + + for (int i = 0; i < n; i++) + { + const Object& a = faceobjects[i]; + + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) + { + const Object& b = faceobjects[picked[j]]; + + if (!agnostic && a.label != b.label) + continue; + + // intersection over union + float inter_area = intersection_area(a, b); + float union_area = areas[i] + areas[picked[j]] - inter_area; + // float IoU = inter_area / union_area + if (inter_area / union_area > nms_threshold) + keep = 0; + } + + if (keep) + picked.push_back(i); + } +} + +static inline float sigmoid(float x) +{ + return static_cast(1.f / (1.f + exp(-x))); +} + +static inline float clampf(float d, float min, float max) +{ + const float t = d < min ? min : d; + return t > max ? max : t; +} + +static void parse_yolov8_detections( + float* inputs, float confidence_threshold, + int num_channels, int num_anchors, int num_labels, + int infer_img_width, int infer_img_height, + std::vector& objects) +{ + std::vector detections; + cv::Mat output = cv::Mat((int)num_channels, (int)num_anchors, CV_32F, inputs).t(); + + for (int i = 0; i < num_anchors; i++) + { + const float* row_ptr = output.row(i).ptr(); + const float* bboxes_ptr = row_ptr; + const float* scores_ptr = row_ptr + 4; + const float* max_s_ptr = std::max_element(scores_ptr, scores_ptr + num_labels); + float score = *max_s_ptr; + if (score > confidence_threshold) + { + float x = *bboxes_ptr++; + float y = *bboxes_ptr++; + float w = *bboxes_ptr++; + float h = *bboxes_ptr; + + float x0 = clampf((x - 0.5f * w), 0.f, (float)infer_img_width); + float y0 = clampf((y - 0.5f * h), 0.f, (float)infer_img_height); + float x1 = clampf((x + 0.5f * w), 0.f, (float)infer_img_width); + float y1 = clampf((y + 0.5f * h), 0.f, (float)infer_img_height); + + cv::Rect_ bbox; + bbox.x = x0; + bbox.y = y0; + bbox.width = x1 - x0; + bbox.height = y1 - y0; + Object object; + object.label = max_s_ptr - scores_ptr; + object.prob = score; + object.rect = bbox; + detections.push_back(object); + } + } + objects = detections; +} + +static int detect_yolov8(const cv::Mat& bgr, std::vector& objects) +{ + ncnn::Net yolov8; + + yolov8.opt.use_vulkan_compute = true; // if you want detect in hardware, then enable it + + yolov8.load_param("yolov8n.param"); + yolov8.load_model("yolov8n.bin"); + + const int target_size = 640; + const float prob_threshold = 0.25f; + const float nms_threshold = 0.45f; + + int img_w = bgr.cols; + int img_h = bgr.rows; + + // letterbox pad to multiple of MAX_STRIDE + int w = img_w; + int h = img_h; + float scale = 1.f; + if (w > h) + { + scale = (float)target_size / w; + w = target_size; + h = h * scale; + } + else + { + scale = (float)target_size / h; + h = target_size; + w = w * scale; + } + + ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); + + int wpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w; + int hpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h; + ncnn::Mat in_pad; + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); + + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; + 
in_pad.substract_mean_normalize(0, norm_vals); + + ncnn::Extractor ex = yolov8.create_extractor(); + + ex.input("in0", in_pad); + + std::vector proposals; + + // stride 32 + { + ncnn::Mat out; + ex.extract("out0", out); + + std::vector objects32; + const int num_labels = 80; // COCO has detect 80 object labels. + parse_yolov8_detections( + (float*)out.data, prob_threshold, + out.h, out.w, num_labels, + in_pad.w, in_pad.h, + objects32); + proposals.insert(proposals.end(), objects32.begin(), objects32.end()); + } + + // sort all proposals by score from highest to lowest + qsort_descent_inplace(proposals); + + // apply nms with nms_threshold + std::vector picked; + nms_sorted_bboxes(proposals, picked, nms_threshold); + + int count = picked.size(); + + objects.resize(count); + for (int i = 0; i < count; i++) + { + objects[i] = proposals[picked[i]]; + + // adjust offset to original unpadded + float x0 = (objects[i].rect.x - (wpad / 2)) / scale; + float y0 = (objects[i].rect.y - (hpad / 2)) / scale; + float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; + float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; + + // clip + x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); + y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); + x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); + y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); + + objects[i].rect.x = x0; + objects[i].rect.y = y0; + objects[i].rect.width = x1 - x0; + objects[i].rect.height = y1 - y0; + } + + return 0; +} + +static void draw_objects(const cv::Mat& bgr, const std::vector& objects) +{ + static const char* class_names[] = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush" + }; + + static const unsigned char colors[19][3] = { + {54, 67, 244}, + {99, 30, 233}, + {176, 39, 156}, + {183, 58, 103}, + {181, 81, 63}, + {243, 150, 33}, + {244, 169, 3}, + {212, 188, 0}, + {136, 150, 0}, + {80, 175, 76}, + {74, 195, 139}, + {57, 220, 205}, + {59, 235, 255}, + {7, 193, 255}, + {0, 152, 255}, + {34, 87, 255}, + {72, 85, 121}, + {158, 158, 158}, + {139, 125, 96} + }; + + int color_index = 0; + + cv::Mat image = bgr.clone(); + + for (size_t i = 0; i < objects.size(); i++) + { + const Object& obj = objects[i]; + + const unsigned char* color = colors[color_index % 19]; + color_index++; + + cv::Scalar cc(color[0], color[1], color[2]); + + fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, + obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); + + cv::rectangle(image, obj.rect, cc, 2); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); + + int baseLine = 0; + cv::Size label_size = cv::getTextSize(text, 
cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + int x = obj.rect.x; + int y = obj.rect.y - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), + cc, -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255)); + } + + cv::imshow("image", image); + cv::waitKey(0); +} + +int main(int argc, char** argv) +{ + if (argc != 2) + { + fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); + return -1; + } + + const char* imagepath = argv[1]; + + cv::Mat m = cv::imread(imagepath, 1); + if (m.empty()) + { + fprintf(stderr, "cv::imread %s failed\n", imagepath); + return -1; + } + + std::vector objects; + detect_yolov8(m, objects); + + draw_objects(m, objects); + + return 0; +} diff --git a/python/src/main.cpp b/python/src/main.cpp index a7ed0528c6ab..e5b1264264c9 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -34,6 +34,20 @@ using namespace ncnn; namespace py = pybind11; +class DataReaderFromMemoryCopy : public DataReaderFromMemory +{ +public: + explicit DataReaderFromMemoryCopy(const unsigned char*& mem) + : DataReaderFromMemory(mem) + { + } + + virtual size_t reference(size_t size, const void** buf) const + { + return 0; + } +}; + struct LayerFactory { std::string name; @@ -956,6 +970,13 @@ PYBIND11_MODULE(ncnn, m) #endif // NCNN_STRING .def("load_param_bin", (int (Net::*)(const char*)) & Net::load_param_bin, py::arg("protopath")) .def("load_model", (int (Net::*)(const char*)) & Net::load_model, py::arg("modelpath")) + .def( + "load_model_mem", [](Net& net, const char* mem) { + const unsigned char* _mem = (const unsigned char*)mem; + DataReaderFromMemoryCopy dr(_mem); + net.load_model(dr); + }, + py::arg("mem")) #endif // NCNN_STDIO .def("clear", &Net::clear) diff --git a/python/tests/test_net.py b/python/tests/test_net.py index 03271aff4623..362cc4791fb8 100644 --- a/python/tests/test_net.py +++ b/python/tests/test_net.py @@ -42,6 +42,32 @@ def test_net(): assert len(net.blobs()) == 0 and len(net.layers()) == 0 +def test_net_mem(): + modelbin = bytearray(303940) + modelbin[0:4] = 71,107,48,1 + modelbin[180:184] = 71,107,48,1 + + with ncnn.Net() as net: + ret = net.load_param("tests/test.param") + net.load_model_mem(bytes(modelbin)) + assert ret == 0 and len(net.blobs()) == 3 and len(net.layers()) == 3 + + input_names = net.input_names() + output_names = net.output_names() + assert len(input_names) > 0 and len(output_names) > 0 + + in_mat = ncnn.Mat((227, 227, 3)) + + with net.create_extractor() as ex: + ex.input("data", in_mat) + ret, out_mat = ex.extract("output") + + assert ret == 0 and out_mat.dims == 1 and out_mat.w == 1 + + net.clear() + assert len(net.blobs()) == 0 and len(net.layers()) == 0 + + def test_net_vulkan(): if not hasattr(ncnn, "get_gpu_count"): return diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d3f55ce77900..803c34a780d4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -166,6 +166,7 @@ ncnn_add_layer(Erf) ncnn_add_layer(Diag) ncnn_add_layer(CELU) ncnn_add_layer(Shrink) +ncnn_add_layer(RMSNorm) if(NCNN_VULKAN) ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp) diff --git a/src/c_api.cpp b/src/c_api.cpp index 5662d1b51554..f8146e054c27 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -1240,6 +1240,13 @@ void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt) 
((Net*)net->pthis)->opt = *((Option*)opt); } +#if NCNN_VULKAN +void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index) +{ + ((Net*)net->pthis)->set_vulkan_device(device_index); +} +#endif + static ::ncnn::Layer* __Layer_c_api_layer_creator(void* userdata) { ncnn_net_custom_layer_factory_t ud = (ncnn_net_custom_layer_factory_t)userdata; diff --git a/src/c_api.h b/src/c_api.h index d153b2a4ef0f..f752bfed6636 100644 --- a/src/c_api.h +++ b/src/c_api.h @@ -275,6 +275,10 @@ NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net); NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net); NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt); +#if NCNN_VULKAN +NCNN_EXPORT void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index); +#endif + #if NCNN_STRING NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata); #endif /* NCNN_STRING */ diff --git a/src/cpu.cpp b/src/cpu.cpp index ba050e7b1e62..e42bcfafeb21 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -47,10 +47,9 @@ #include #endif -#if defined _WIN32 && !(defined __MINGW32__) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include -#include #endif #if defined __ANDROID__ || defined __linux__ @@ -130,8 +129,10 @@ #include #endif +#if (defined _WIN32 && (__aarch64__ || __arm__)) #define RUAPU_IMPLEMENTATION #include "ruapu.h" +#endif // topology info static int g_cpucount; @@ -597,9 +598,6 @@ static int get_cpu_support_x86_avx2() static int get_cpu_support_x86_avx_vnni() { -#if __APPLE__ - return ruapu_supports("avxvnni"); -#else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -618,13 +616,16 @@ static int get_cpu_support_x86_avx_vnni() x86_cpuid_sublevel(7, 1, cpu_info); return cpu_info[0] & (1u << 4); -#endif } static int get_cpu_support_x86_avx512() { #if __APPLE__ - return ruapu_supports("avx512f") && ruapu_supports("avx512bw") && ruapu_supports("avx512cd") && ruapu_supports("avx512dq") && ruapu_supports("avx512vl"); + return get_hw_capability("hw.optional.avx512f") + && get_hw_capability("hw.optional.avx512bw") + && get_hw_capability("hw.optional.avx512cd") + && get_hw_capability("hw.optional.avx512dq") + && get_hw_capability("hw.optional.avx512vl"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -654,7 +655,7 @@ static int get_cpu_support_x86_avx512() static int get_cpu_support_x86_avx512_vnni() { #if __APPLE__ - return ruapu_supports("avx512vnni"); + return get_hw_capability("hw.optional.avx512vnni"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -684,7 +685,7 @@ static int get_cpu_support_x86_avx512_vnni() static int get_cpu_support_x86_avx512_bf16() { #if __APPLE__ - return ruapu_supports("avx512bf16"); + return get_hw_capability("hw.optional.avx512bf16"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -710,7 +711,7 @@ static int get_cpu_support_x86_avx512_bf16() static int get_cpu_support_x86_avx512_fp16() { #if __APPLE__ - return ruapu_supports("avx512fp16"); + return get_hw_capability("hw.optional.avx512fp16"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -746,7 +747,7 @@ static int get_cpucount() count = emscripten_num_logical_cores(); else count = 1; -#elif (defined _WIN32 && !(defined __MINGW32__)) +#elif defined _WIN32 SYSTEM_INFO system_info; GetSystemInfo(&system_info); count = system_info.dwNumberOfProcessors; @@ -813,7 +814,7 @@ static int get_thread_siblings(int cpuid) static int 
get_physical_cpucount() { int count = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi == NULL) @@ -1051,7 +1052,7 @@ static int get_big_cpu_data_cache_size(int level) static int get_cpu_level2_cachesize() { int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) @@ -1121,7 +1122,7 @@ static int get_cpu_level2_cachesize() static int get_cpu_level3_cachesize() { int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) @@ -1168,7 +1169,7 @@ static int get_cpu_level3_cachesize() return size; } -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 static ncnn::CpuSet get_smt_cpu_mask() { ncnn::CpuSet smt_cpu_mask; @@ -1262,7 +1263,7 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) return 0; } -#endif // (defined _WIN32 && !(defined __MINGW32__)) +#endif // defined _WIN32 #if defined __ANDROID__ || defined __linux__ static int get_max_freq_khz(int cpuid) @@ -1436,7 +1437,7 @@ static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::Cp mask_all.enable(i); } -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 // get max freq mhz for all cores int max_freq_mhz_min = INT_MAX; int max_freq_mhz_max = 0; @@ -1953,7 +1954,7 @@ static void initialize_global_cpu_info() g_powersave = 0; initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big); -#if (defined _WIN32 && (__aarch64__ || __arm__)) || __APPLE__ +#if (defined _WIN32 && (__aarch64__ || __arm__)) if (!is_being_debugged()) { ruapu_init(); @@ -2030,7 +2031,7 @@ static inline void try_initialize_global_cpu_info() namespace ncnn { -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 CpuSet::CpuSet() { disable_all(); @@ -2799,7 +2800,7 @@ const CpuSet& get_cpu_thread_affinity_mask(int powersave) int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask) { try_initialize_global_cpu_info(); -#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__)) +#if defined __ANDROID__ || defined __linux__ || defined _WIN32 #ifdef _OPENMP int num_threads = thread_affinity_mask.num_enabled(); diff --git a/src/cpu.h b/src/cpu.h index 7d6bfce1108a..2ae6b8c3ffe9 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -17,7 +17,7 @@ #include -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include #endif @@ -40,7 +40,7 @@ class NCNN_EXPORT CpuSet int num_enabled() const; public: -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 ULONG_PTR mask; #endif #if defined __ANDROID__ || defined __linux__ diff --git a/src/layer/arm/convolution_3x3_pack1to8_fp16s.h b/src/layer/arm/convolution_3x3_pack1to8_fp16s.h index bd03d450b2e8..40e276cdedff 100644 --- a/src/layer/arm/convolution_3x3_pack1to8_fp16s.h +++ b/src/layer/arm/convolution_3x3_pack1to8_fp16s.h @@ 
-68,8 +68,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "sub %0, %0, #64 \n" "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1], #16 \n" // r0 - "ld1 {v1.4h}, [%1] \n" + "ldr q0, [%1], #16 \n" // r0 + "ldr s1, [%1] \n" "fmla v24.8h, %8.8h, v0.h[0] \n" "fmla v25.8h, %8.8h, v0.h[1] \n" @@ -99,8 +99,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[1] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v2.8h}, [%2], #16 \n" // r1 - "ld1 {v3.4h}, [%2] \n" + "ldr q2, [%2], #16 \n" // r1 + "ldr s3, [%2] \n" "fmla v24.8h, %11.8h, v2.h[0] \n" "fmla v25.8h, %11.8h, v2.h[1] \n" @@ -130,8 +130,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[1] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v4.8h}, [%3], #16 \n" // r2 - "ld1 {v5.4h}, [%3] \n" + "ldr q4, [%3], #16 \n" // r2 + "ldr s5, [%3] \n" "fmla v24.8h, %14.8h, v4.h[0] \n" "fmla v25.8h, %14.8h, v4.h[1] \n" @@ -189,7 +189,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3 "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1] \n" // r0 + "ldr q0, [%1] \n" // r0 "fmla v28.8h, %8.8h, v0.h[0] \n" "fmla v29.8h, %8.8h, v0.h[1] \n" @@ -207,7 +207,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v0.h[5] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v1.8h}, [%2] \n" // r1 + "ldr q1, [%2] \n" // r1 "fmla v28.8h, %11.8h, v1.h[0] \n" "fmla v29.8h, %11.8h, v1.h[1] \n" @@ -225,7 +225,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v1.h[5] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v2.8h}, [%3] \n" // r2 + "ldr q2, [%3] \n" // r2 "fmla v28.8h, %14.8h, v2.h[0] \n" "fmla v29.8h, %14.8h, v2.h[1] \n" @@ -274,7 +274,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v31.8h, %8.8h, v0.h[1] \n" @@ -284,7 +284,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v0.h[3] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v31.8h, %11.8h, v1.h[1] \n" @@ -294,7 +294,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v1.h[3] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v31.8h, %14.8h, v2.h[1] \n" @@ -332,24 +332,24 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob { asm volatile( "prfm pldl1keep, [%0, #128] \n" - "ld1 {v30.8h}, [%0] \n" // sum0 + "ldr q30, [%0] \n" // sum0 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v30.8h, %9.8h, v0.h[1] \n" "fmla v30.8h, %10.8h, v0.h[2] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v30.8h, %12.8h, v1.h[1] \n" "fmla v30.8h, %13.8h, v1.h[2] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v30.8h, %15.8h, 
v2.h[1] \n" @@ -359,7 +359,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "add %2, %2, #2 \n" "add %3, %3, #2 \n" - "st1 {v30.8h}, [%0], #16 \n" + "str q30, [%0], #16 \n" : "=r"(outptr0), // %0 "=r"(r0), // %1 @@ -445,8 +445,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3 "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1], #16 \n" // r0 - "ld1 {v1.h}[0], [%1] \n" + "ldr q0, [%1], #16 \n" // r0 + "ldr h1, [%1] \n" "fmla v28.8h, %8.8h, v0.h[0] \n" "fmla v29.8h, %8.8h, v0.h[2] \n" @@ -464,8 +464,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[0] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v2.8h}, [%2], #16 \n" // r1 - "ld1 {v3.h}[0], [%2] \n" + "ldr q2, [%2], #16 \n" // r1 + "ldr h3, [%2] \n" "fmla v28.8h, %11.8h, v2.h[0] \n" "fmla v29.8h, %11.8h, v2.h[2] \n" @@ -483,8 +483,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[0] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v4.8h}, [%3], #16 \n" // r2 - "ld1 {v5.h}[0], [%3] \n" + "ldr q4, [%3], #16 \n" // r2 + "ldr h5, [%3] \n" "fmla v28.8h, %14.8h, v4.h[0] \n" "fmla v29.8h, %14.8h, v4.h[2] \n" @@ -529,8 +529,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1], #8 \n" // r0 - "ld1 {v1.h}[0], [%1] \n" + "ldr d0, [%1], #8 \n" // r0 + "ldr h1, [%1] \n" "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v31.8h, %8.8h, v0.h[2] \n" @@ -540,8 +540,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[0] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v2.4h}, [%2], #8 \n" // r1 - "ld1 {v3.h}[0], [%2] \n" + "ldr d2, [%2], #8 \n" // r1 + "ldr h3, [%2] \n" "fmla v30.8h, %11.8h, v2.h[0] \n" "fmla v31.8h, %11.8h, v2.h[2] \n" @@ -551,8 +551,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[0] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v4.4h}, [%3], #8 \n" // r2 - "ld1 {v5.h}[0], [%3] \n" + "ldr d4, [%3], #8 \n" // r2 + "ldr h5, [%3] \n" "fmla v30.8h, %14.8h, v4.h[0] \n" "fmla v31.8h, %14.8h, v4.h[2] \n" @@ -586,24 +586,24 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob { asm volatile( "prfm pldl1keep, [%0, #128] \n" - "ld1 {v30.8h}, [%0] \n" // sum0 + "ldr q30, [%0] \n" // sum0 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v30.8h, %9.8h, v0.h[1] \n" "fmla v30.8h, %10.8h, v0.h[2] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v30.8h, %12.8h, v1.h[1] \n" "fmla v30.8h, %13.8h, v1.h[2] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v30.8h, %15.8h, v2.h[1] \n" @@ -613,7 +613,7 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "add %2, %2, #4 \n" "add %3, %3, #4 \n" - "st1 {v30.8h}, [%0], #16 \n" + "str q30, [%0], #16 \n" : "=r"(outptr0), // %0 "=r"(r0), // %1 diff --git a/src/layer/arm/convolution_im2col_gemm.h b/src/layer/arm/convolution_im2col_gemm.h index af501efa2f80..25a3e94d781a 100644 --- a/src/layer/arm/convolution_im2col_gemm.h +++ 
b/src/layer/arm/convolution_im2col_gemm.h @@ -3377,7 +3377,7 @@ static void convolution_gemm_transB_packed_tile(const Mat& AT_tile, const Mat& B "cbz %w10, 0f \n" "ld1 {v30.4s, v31.4s}, [%0] \n" - "b 3f \n" + "b 2f \n" "0: \n" // if pC diff --git a/src/layer/arm/convolution_im2col_gemm_bf16s.h b/src/layer/arm/convolution_im2col_gemm_bf16s.h index 82319d05850c..95819e2d679f 100644 --- a/src/layer/arm/convolution_im2col_gemm_bf16s.h +++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h @@ -3110,7 +3110,7 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "cbz %w10, 0f \n" "ld1 {v30.4s, v31.4s}, [%0] \n" - "b 3f \n" + "b 2f \n" "0: \n" // if pC @@ -3125,15 +3125,13 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "eor v31.16b, v31.16b, v31.16b \n" "2: \n" - - "3: \n" "lsr w4, %w9, #2 \n" // w4 = max_kk >> 2 "cmp w4, #0 \n" - "beq 5f \n" + "beq 4f \n" "eor v28.16b, v28.16b, v28.16b \n" "eor v29.16b, v29.16b, v29.16b \n" - "4: \n" + "3: \n" "prfm pldl1keep, [%2, #64] \n" "ld1 {v0.4h}, [%2], #8 \n" "shll v0.4s, v0.4h, #16 \n" @@ -3156,16 +3154,16 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "subs w4, w4, #1 \n" "fmla v30.4s, v10.4s, v0.s[3] \n" "fmla v31.4s, v11.4s, v0.s[3] \n" - "bne 4b \n" + "bne 3b \n" "fadd v30.4s, v30.4s, v28.4s \n" "fadd v31.4s, v31.4s, v29.4s \n" - "5: \n" + "4: \n" "and w4, %w9, #3 \n" // w4 = remain = max_kk & 3 "cmp w4, #0 \n" - "beq 7f \n" + "beq 6f \n" - "6: \n" + "5: \n" "ld1r {v0.4h}, [%2], #2 \n" "shll v0.4s, v0.4h, #16 \n" "ld1 {v3.8h}, [%1], #16 \n" @@ -3174,26 +3172,26 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "subs w4, w4, #1 \n" "fmla v30.4s, v4.4s, v0.4s \n" "fmla v31.4s, v5.4s, v0.4s \n" - "bne 6b \n" + "bne 5b \n" - "7: \n" + "6: \n" "shrn v30.4h, v30.4s, #16 \n" "shrn v31.4h, v31.4s, #16 \n" "tst %w11, #255 \n" - "beq 10f \n" + "beq 9f \n" // if out_elempack == 4 "cmp %w12, #4 \n" - "bne 8f \n" + "bne 7f \n" "lsl w4, %w13, #2 \n" "add x4, %3, w4, sxtw 1 \n" "st1 {v30.4h}, [%3], #8 \n" "st1 {v31.4h}, [x4] \n" - "b 9f \n" + "b 8f \n" // if out_elempack == 1 - "8: \n" + "7: \n" "add x4, %3, %w13, sxtw 1 \n" "st1 {v30.h}[0], [%3], #2 \n" "st1 {v30.h}[1], [x4] \n" @@ -3210,14 +3208,14 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "add x4, x4, %w13, sxtw 1 \n" "st1 {v31.h}[3], [x4] \n" - "9: \n" + "8: \n" "add %0, %0, #32 \n" - "b 11f \n" + "b 10f \n" - "10: \n" + "9: \n" "st1 {v30.4s, v31.4s}, [%0], #32 \n" - "11: \n" + "10: \n" : "=r"(outptr), // %0 "=r"(pA), // %1 diff --git a/src/layer/arm/rmsnorm_arm.cpp b/src/layer/arm/rmsnorm_arm.cpp new file mode 100644 index 000000000000..e19136ca29d6 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm.cpp @@ -0,0 +1,417 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "rmsnorm_arm.h" + +#if __ARM_NEON +#include +#endif // __ARM_NEON + +#include "arm_usability.h" +#include "cpu.h" + +namespace ncnn { + +RMSNorm_arm::RMSNorm_arm() +{ +#if __ARM_NEON + support_packing = true; +#if NCNN_ARM82 + support_fp16_storage = cpu_support_arm_asimdhp(); +#endif +#endif // __ARM_NEON + +#if NCNN_BF16 + support_bf16_storage = true; +#endif +} + +static void rmsnorm(float* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __ARM_NEON + float32x4_t _rms = vdupq_n_f32(0.f); +#endif // __ARM_NEON + float rms = 0.f; + { + const float* ptr0 = ptr; + + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr0); + _rms = vmlaq_f32(_rms, _p, _p); + ptr0 += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + rms += ptr0[0] * ptr0[0]; + ptr0++; + } + } + +#if __ARM_NEON + if (elempack == 4) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + +#if __aarch64__ + _rms = vdivq_f32(_rms, _elemcount); + _rms = vaddq_f32(_rms, _eps); +#else + float32x4_t _inv_elemcount = vrecpeq_f32(_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _rms = vmlaq_f32(_eps, _rms, _inv_elemcount); +#endif + + float32x4_t _rsqrt_rms = vrsqrteq_f32(_rms); + _rsqrt_rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + _rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + } +#endif // __ARM_NEON + if (elempack == 1) + { +#if __ARM_NEON +#if __aarch64__ + rms += vaddvq_f32(_rms); +#else + float32x2_t _s2 = vadd_f32(vget_low_f32(_rms), vget_high_f32(_rms)); + _s2 = vpadd_f32(_s2, _s2); + rms += vget_lane_f32(_s2, 0); +#endif +#endif // __ARM_NEON + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __ARM_NEON + _rms = vdupq_n_f32(rms); +#endif // __ARM_NEON + } + + if (gamma_ptr) + { + int i = 0; +#if __ARM_NEON + if (elempack == 4) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1q_f32(ptr, _p); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1q_f32(ptr, _p); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __ARM_NEON + for (; i < size; i++) + { + ptr[0] = (ptr[0] * rms) * gamma_ptr[0]; + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + _p = vmulq_f32(_p, _rms); + vst1q_f32(ptr, _p); + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + ptr[0] = ptr[0] * rms; + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int elembits = bottom_top_blob.elembits(); + +#if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + return forward_inplace_fp16s(bottom_top_blob, opt); +#endif + +#if NCNN_BF16 + if (opt.use_bf16_storage && elembits == 16) + return forward_inplace_bf16s(bottom_top_blob, opt); +#endif + + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = 
bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + float* ptr = bottom_top_blob; + rmsnorm(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + rmsnorm(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} + +#if NCNN_BF16 +static void rmsnorm_bf16s(unsigned short* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __ARM_NEON + float32x4_t _rms = vdupq_n_f32(0.f); +#endif // __ARM_NEON + float rms = 0.f; + { + const unsigned short* ptr0 = ptr; + + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr0)); + _rms = vmlaq_f32(_rms, _p, _p); + ptr0 += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr0[0]); + rms += v * v; + ptr0++; + } + } + +#if __ARM_NEON + if (elempack == 4) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + +#if __aarch64__ + _rms = vdivq_f32(_rms, _elemcount); + _rms = vaddq_f32(_rms, _eps); +#else + float32x4_t _inv_elemcount = vrecpeq_f32(_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _rms = vmlaq_f32(_eps, _rms, _inv_elemcount); +#endif + + float32x4_t _rsqrt_rms = vrsqrteq_f32(_rms); + _rsqrt_rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + _rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + } +#endif // __ARM_NEON + if (elempack == 1) + { +#if __ARM_NEON +#if __aarch64__ + rms += vaddvq_f32(_rms); +#else + float32x2_t _s2 = vadd_f32(vget_low_f32(_rms), vget_high_f32(_rms)); + _s2 = vpadd_f32(_s2, _s2); + rms += vget_lane_f32(_s2, 0); +#endif +#endif // __ARM_NEON + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __ARM_NEON + _rms = vdupq_n_f32(rms); +#endif // __ARM_NEON + } + + if (gamma_ptr) + { + int i = 0; +#if __ARM_NEON + if (elempack == 4) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr[0]); + ptr[0] = float32_to_bfloat16((v * rms) * gamma_ptr[0]); + ptr++; + gamma_ptr++; + } + } + else + { + 
int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + _p = vmulq_f32(_p, _rms); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr[0]); + ptr[0] = float32_to_bfloat16(v * rms); + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + unsigned short* ptr = bottom_top_blob; + rmsnorm_bf16s(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + unsigned short* ptr = bottom_top_blob.row(i); + rmsnorm_bf16s(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + unsigned short* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm_bf16s(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + unsigned short* ptr = bottom_top_blob.channel(q); + rmsnorm_bf16s(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} +#endif // NCNN_BF16 + +} // namespace ncnn diff --git a/src/layer/arm/rmsnorm_arm.h b/src/layer/arm/rmsnorm_arm.h new file mode 100644 index 000000000000..440153333710 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm.h @@ -0,0 +1,40 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_ARM_H +#define LAYER_RMSNORM_ARM_H + +#include "rmsnorm.h" + +namespace ncnn { + +class RMSNorm_arm : public RMSNorm +{ +public: + RMSNorm_arm(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if NCNN_ARM82 + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; +#endif +#if NCNN_BF16 + int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_ARM_H diff --git a/src/layer/arm/rmsnorm_arm_asimdhp.cpp b/src/layer/arm/rmsnorm_arm_asimdhp.cpp new file mode 100644 index 000000000000..98d8e6964876 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm_asimdhp.cpp @@ -0,0 +1,272 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
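Reviewer note: in both the fp32 and bf16 kernels above, the packed path computes 1/sqrt(mean_sq + eps) with vrsqrteq_f32 followed by two vrsqrtsq_f32 Newton-Raphson steps (and, on non-AArch64, the division by elemcount uses the analogous vrecpeq_f32/vrecpsq_f32 pair). A scalar sketch of that refinement follows, with a deliberately rough initial estimate standing in for the hardware approximation; the input value is illustrative only.

```cpp
// Scalar model of the vrsqrteq_f32 + 2x vrsqrtsq_f32 sequence used above.
// vrsqrtsq_f32(a, b) returns (3 - a*b) / 2, so each step below is one
// Newton-Raphson iteration y <- y * (3 - x*y*y) / 2 toward 1/sqrt(x).
#include <cmath>
#include <cstdio>

static float rsqrt_refined(float x)
{
    // stand-in for the coarse hardware estimate produced by vrsqrteq_f32
    float y = 1.0f / std::sqrt(x) * 1.01f;

    y = y * (3.0f - x * y * y) * 0.5f; // first vrsqrtsq step
    y = y * (3.0f - x * y * y) * 0.5f; // second vrsqrtsq step
    return y;
}

int main()
{
    const float mean_sq_plus_eps = 7.5f;
    printf("refined=%.9f exact=%.9f\n",
           rsqrt_refined(mean_sq_plus_eps), 1.0f / std::sqrt(mean_sq_plus_eps));
    return 0;
}
```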
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "rmsnorm_arm.h" + +#if __ARM_NEON +#include +#include "arm_usability.h" +#endif // __ARM_NEON + +namespace ncnn { + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +static void rmsnorm_fp16s(__fp16* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + + float32x4_t _rms0 = vdupq_n_f32(0.f); + float32x4_t _rms1 = vdupq_n_f32(0.f); + float rms = 0.f; + { + const __fp16* ptr0 = ptr; + + int i = 0; + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr0); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + _rms0 = vmlaq_f32(_rms0, _p0, _p0); + _rms1 = vmlaq_f32(_rms1, _p1, _p1); + ptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr0)); + _rms0 = vmlaq_f32(_rms0, _p, _p); + ptr0 += 4; + } + for (; i < size; i++) + { + rms += (float)ptr0[0] * (float)ptr0[0]; + ptr0++; + } + } + + if (elempack == 8) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + + _rms0 = vdivq_f32(_rms0, _elemcount); + _rms1 = vdivq_f32(_rms1, _elemcount); + _rms0 = vaddq_f32(_rms0, _eps); + _rms1 = vaddq_f32(_rms1, _eps); + + float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0); + float32x4_t _rsqrt_rms1 = vrsqrteq_f32(_rms1); + _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rsqrt_rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1); + _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1); + } + if (elempack == 4) + { + _rms0 = vaddq_f32(_rms0, _rms1); + + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + + _rms0 = vdivq_f32(_rms0, _elemcount); + _rms0 = vaddq_f32(_rms0, _eps); + + float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0); + _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms1 = _rms0; + } + if (elempack == 1) + { + _rms0 = vaddq_f32(_rms0, _rms1); + rms += vaddvq_f32(_rms0); + + rms = 1.f / sqrtf(rms / elemcount + eps); + _rms0 = vdupq_n_f32(rms); + _rms1 = _rms0; + } + + if (gamma_ptr) + { + int i = 0; + if (elempack == 8) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma); + _p1 = vmulq_f32(_p1, _gamma); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 1; + } + } + if (elempack == 4) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = 
vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma0 = vdupq_n_f32(gamma_ptr[0]); + float32x4_t _gamma1 = vdupq_n_f32(gamma_ptr[1]); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma0); + _p1 = vmulq_f32(_p1, _gamma1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 2; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms0); + _p = vmulq_f32(_p, _gamma); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma0 = vld1q_f32(gamma_ptr); + float32x4_t _gamma1 = vld1q_f32(gamma_ptr + 4); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma0); + _p1 = vmulq_f32(_p1, _gamma1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms0); + _p = vmulq_f32(_p, _gamma); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + gamma_ptr += 4; + } + } + for (; i < size; i++) + { + ptr[0] = (__fp16)(((float)ptr[0] * rms) * gamma_ptr[0]); + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + _p = vmulq_f32(_p, _rms0); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + } + for (; i < size; i++) + { + ptr[0] = (__fp16)((float)ptr[0] * rms); + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + __fp16* ptr = bottom_top_blob; + rmsnorm_fp16s(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + rmsnorm_fp16s(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i); + rmsnorm_fp16s(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + rmsnorm_fp16s(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +} // namespace ncnn diff --git a/src/layer/embed.cpp b/src/layer/embed.cpp index ddda6b8bf199..2b9f8a60042c 100644 --- a/src/layer/embed.cpp +++ b/src/layer/embed.cpp @@ -30,6 +30,7 @@ int Embed::load_param(const ParamDict& pd) input_dim = pd.get(1, 0); bias_term = pd.get(2, 0); weight_data_size = pd.get(3, 0); + int8_scale_term = pd.get(18, 0); return 0; } @@ -47,18 +48,23 @@ int Embed::load_model(const ModelBin& mb) return -100; } +#if NCNN_INT8 + if (int8_scale_term) + { + weight_data_int8_scale = mb.load(1, 1)[0]; + } +#endif // NCNN_INT8 + return 0; } -int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) { - int words = static_cast(bottom_blob.total()); + const int num_output = top_blob.w; + const int words = top_blob.h; - top_blob.create(num_output, words, 4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const float* bias_ptr = bias_data; - // num_output #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < words; q++) { @@ -73,15 +79,79 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) con const float* em = (const float*)weight_data + num_output * word_index; - memcpy(outptr, em, num_output * sizeof(float)); + if (bias_ptr) + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] + bias_ptr[p]; + } + } + else + { + memcpy(outptr, em, num_output * sizeof(float)); + } + } +} + +#if NCNN_INT8 +static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) +{ + const int num_output = top_blob.w; + const int words = top_blob.h; + + const float* bias_ptr = bias_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < words; q++) + { + float* outptr = top_blob.row(q); + + int word_index = ((const int*)bottom_blob)[q]; - if (bias_term) + if (word_index < 0) + word_index = 0; + if (word_index >= input_dim) + word_index = input_dim - 1; + + const float descale_em = 1.f / weight_data_int8_scale; + + const signed char* em = (const signed char*)weight_data + num_output * word_index; + + if (bias_ptr) { for (int p = 0; p < num_output; p++) { - outptr[p] += bias_data[p]; + outptr[p] = em[p] * descale_em + bias_ptr[p]; } } + else + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] * descale_em; + } + } + } +} +#endif // NCNN_INT8 + +int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int words = static_cast(bottom_blob.total()); + + top_blob.create(num_output, words, 4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if NCNN_INT8 + if (int8_scale_term) + { + embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt); + } + else +#endif // NCNN_INT8 + { + embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt); } return 0; diff --git a/src/layer/embed.h b/src/layer/embed.h index 8e2366567163..b94c2b17bee4 100644 --- a/src/layer/embed.h +++ b/src/layer/embed.h @@ -38,9 +38,15 @@ class Embed : public Layer int weight_data_size; + int int8_scale_term; + // model Mat weight_data; Mat bias_data; + +#if NCNN_INT8 + float weight_data_int8_scale; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 
980261a14966..2ec10bae48a5 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -308,7 +308,7 @@ _RVV_FLOAT32_COS_OP(8, 4) \ /* clamp the inputs to the range [-9, 9] since anything outside */ \ /* this range is -/+1.0f in single-precision. */ \ - x2 = vfmin_vf_f32m##LMUL(x, c_tanh_hi, vl); \ + x2 = vfmin_vf_f32m##LMUL(x2, c_tanh_hi, vl); \ \ /* since the polynomials are odd/even, we need x**2. */ \ vfloat32m##LMUL##_t z = vfmul_vv_f32m##LMUL(x2, x2, vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index ee5ffe4a304b..2cf5d08f4f0b 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -308,7 +308,7 @@ _RVV_FLOAT16_COS_OP(8, 2) \ /* clamp the inputs to the range [-9, 9] since anything outside */ \ /* this range is -/+1.0f in single-precision. */ \ - x2 = vfmin_vf_f16m##LMUL(x, c_tanh_hi, vl); \ + x2 = vfmin_vf_f16m##LMUL(x2, c_tanh_hi, vl); \ \ /* since the polynomials are odd/even, we need x**2. */ \ vfloat16m##LMUL##_t z = vfmul_vv_f16m##LMUL(x2, x2, vl); \ diff --git a/src/layer/rmsnorm.cpp b/src/layer/rmsnorm.cpp new file mode 100644 index 000000000000..77c74c6bccbb --- /dev/null +++ b/src/layer/rmsnorm.cpp @@ -0,0 +1,200 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
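Reviewer note on the two rvv_mathfun hunks above: the tanh approximation clamps its input to [-9, 9] in two steps, presumably x2 = max(x, c_tanh_lo) on the preceding line and then the min against c_tanh_hi; taking the min of the original x instead of x2 silently discarded the lower clamp, so large negative inputs reached the polynomial unclamped. A plain-scalar sketch of the wrong vs. fixed sequence (illustrative values, not the RVV intrinsics):

```cpp
// Illustration of the clamp bug fixed above (plain scalars, not RVV intrinsics).
// Assuming the preceding line computes x2 = max(x, lo), the min must then be
// applied to x2; applying it to x throws the lower bound away.
#include <algorithm>
#include <cstdio>

int main()
{
    const float lo = -9.f, hi = 9.f;
    const float x = -20.f;

    const float x2 = std::max(x, lo);
    const float wrong = std::min(x, hi);  // -20: lower clamp lost
    const float fixed = std::min(x2, hi); // -9: clamped as intended

    printf("wrong=%.1f fixed=%.1f\n", wrong, fixed);
    return 0;
}
```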
+ +#include "rmsnorm.h" + +namespace ncnn { + +RMSNorm::RMSNorm() +{ + one_blob_only = true; + support_inplace = true; +} + +int RMSNorm::load_param(const ParamDict& pd) +{ + affine_size = pd.get(0, 0); + eps = pd.get(1, 0.001f); + affine = pd.get(2, 1); + + return 0; +} + +int RMSNorm::load_model(const ModelBin& mb) +{ + if (affine == 0) + return 0; + + gamma_data = mb.load(affine_size, 1); + if (gamma_data.empty()) + return -100; + + return 0; +} + +int RMSNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + // x = x / sqrt(rms + eps) * gamma + + int dims = bottom_top_blob.dims; + + if (dims == 1) + { + int w = bottom_top_blob.w; + // assert affine_size == w + + float* ptr = bottom_top_blob; + + float sqsum = 0.f; + for (int i = 0; i < w; i++) + { + sqsum += ptr[i] * ptr[i]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int i = 0; i < w; i++) + { + ptr[i] = (ptr[i] * a) * gamma_data[i]; + } + } + else + { + for (int i = 0; i < w; i++) + { + ptr[i] = ptr[i] * a; + } + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + + float sqsum = 0.f; + for (int j = 0; j < w; j++) + { + sqsum += ptr[j] * ptr[j]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int j = 0; j < w; j++) + { + ptr[j] = (ptr[j] * a) * gamma_data[j]; + } + } + else + { + for (int j = 0; j < w; j++) + { + ptr[j] = ptr[j] * a; + } + } + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + + float sqsum = 0.f; + for (int j = 0; j < w; j++) + { + sqsum += ptr[j] * ptr[j]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int j = 0; j < w; j++) + { + ptr[j] = (ptr[j] * a) * gamma_data[j]; + } + } + else + { + for (int j = 0; j < w; j++) + { + ptr[j] = ptr[j] * a; + } + } + } + } + } + else // if (affine_size == size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + float sqsum = 0.f; + for (int i = 0; i < size; i++) + { + sqsum += ptr[i] * ptr[i]; + } + float rms = sqrtf(sqsum / size + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int i = 0; i < size; i++) + { + ptr[i] = (ptr[i] * a) * gamma_data[i]; + } + } + else + { + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a; + } + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/rmsnorm.h b/src/layer/rmsnorm.h new file mode 100644 index 000000000000..4a09f2548bdf --- /dev/null +++ b/src/layer/rmsnorm.h @@ -0,0 +1,43 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_H +#define LAYER_RMSNORM_H + +#include "layer.h" + +namespace ncnn { + +class RMSNorm : public Layer +{ +public: + RMSNorm(); + + virtual int load_param(const ParamDict& pd); + + virtual int load_model(const ModelBin& mb); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +public: + int affine_size; + float eps; + int affine; + + Mat gamma_data; +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_H diff --git a/src/layer/x86/rmsnorm_x86.cpp b/src/layer/x86/rmsnorm_x86.cpp new file mode 100644 index 000000000000..db592c3e3810 --- /dev/null +++ b/src/layer/x86/rmsnorm_x86.cpp @@ -0,0 +1,413 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
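Reviewer note: the reference layer above implements x_i <- x_i / sqrt(mean(x^2) + eps) * gamma_i over each affine_size span, with no mean subtraction (which is the difference from LayerNorm). A standalone sketch of the dims == 1 path with made-up numbers, just to pin down the arithmetic; it is not an ncnn test and does not use the ncnn API.

```cpp
// Minimal standalone sketch of the dims==1 branch above (hypothetical values,
// not part of the patch): x_i <- x_i / sqrt(mean(x^2) + eps) * gamma_i.
#include <cmath>
#include <cstdio>

int main()
{
    float x[4] = {1.f, 2.f, 3.f, 4.f};
    const float gamma[4] = {1.f, 1.f, 1.f, 1.f};
    const float eps = 1e-6f;
    const int w = 4;

    float sqsum = 0.f;
    for (int i = 0; i < w; i++)
        sqsum += x[i] * x[i];                         // 30

    const float a = 1.f / std::sqrt(sqsum / w + eps); // 1/sqrt(7.5) ~= 0.36515

    for (int i = 0; i < w; i++)
        x[i] = x[i] * a * gamma[i];

    printf("%.3f %.3f %.3f %.3f\n", x[0], x[1], x[2], x[3]); // ~0.365 0.730 1.095 1.461
    return 0;
}
```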
+ +#include "rmsnorm_x86.h" + +#if __SSE2__ +#include +#if __AVX__ +#include +#endif // __AVX__ +#endif // __SSE2__ + +#include "x86_usability.h" + +namespace ncnn { + +RMSNorm_x86::RMSNorm_x86() +{ +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ +} + +static void rmsnorm(float* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + __m512 _rms_avx512 = _mm512_set1_ps(0.f); +#endif // __AVX512F__ + __m256 _rms_avx = _mm256_set1_ps(0.f); +#endif // __AVX__ + __m128 _rms = _mm_set1_ps(0.f); +#endif // __SSE2__ + float rms = 0.f; + { + const float* ptr0 = ptr; + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr0); + _rms_avx512 = _mm512_fmadd_ps(_p, _p, _rms_avx512); + ptr0 += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr0); + _rms_avx = _mm256_comp_fmadd_ps(_p, _p, _rms_avx); + ptr0 += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr0); + _rms = _mm_comp_fmadd_ps(_p, _p, _rms); + ptr0 += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + rms += ptr0[0] * ptr0[0]; + ptr0++; + } + } + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + __m512 _elemcount = _mm512_set1_ps((float)elemcount); + __m512 _eps = _mm512_set1_ps(eps); + + _rms_avx512 = _mm512_div_ps(_rms_avx512, _elemcount); + _rms_avx512 = _mm512_add_ps(_rms_avx512, _eps); + + __m256 _rms0 = _mm256_rsqrt_ps(_mm512_extractf32x8_ps(_rms_avx512, 0)); + __m256 _rms1 = _mm256_rsqrt_ps(_mm512_extractf32x8_ps(_rms_avx512, 1)); + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms0), _rms1, 1); + } +#endif // __AVX512F__ + if (elempack == 8) + { +#if __AVX512F__ + { + __m256 _rms0 = _mm512_castps512_ps256(_rms_avx512); + __m256 _rms1 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(_rms_avx512), 1)); + _rms_avx = _mm256_add_ps(_rms_avx, _rms0); + _rms_avx = _mm256_add_ps(_rms_avx, _rms1); + } +#endif // __AVX512F__ + + __m256 _elemcount = _mm256_set1_ps((float)elemcount); + __m256 _eps = _mm256_set1_ps(eps); + + _rms_avx = _mm256_div_ps(_rms_avx, _elemcount); + _rms_avx = _mm256_add_ps(_rms_avx, _eps); + + _rms_avx = _mm256_rsqrt_ps(_rms_avx); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ + } +#endif // __AVX__ + if (elempack == 4) + { +#if __AVX__ +#if __AVX512F__ + { + __m256 _rms0 = _mm512_castps512_ps256(_rms_avx512); + __m256 _rms1 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(_rms_avx512), 1)); + _rms_avx = _mm256_add_ps(_rms_avx, _rms0); + _rms_avx = _mm256_add_ps(_rms_avx, _rms1); + } +#endif // __AVX512F__ + { + __m128 _rms0 = _mm256_castps256_ps128(_rms_avx); + __m128 _rms1 = _mm256_extractf128_ps(_rms_avx, 1); + _rms = _mm_add_ps(_rms, _rms0); + _rms = _mm_add_ps(_rms, _rms1); + } +#endif // __AVX__ + + __m128 _elemcount = _mm_set1_ps((float)elemcount); + __m128 _eps = _mm_set1_ps(eps); + + _rms = _mm_div_ps(_rms, _elemcount); + _rms = _mm_add_ps(_rms, _eps); + + _rms = _mm_rsqrt_ps(_rms); +#if __AVX__ + _rms_avx = _mm256_insertf128_ps(_mm256_castps128_ps256(_rms), _rms, 1); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ +#endif // __AVX__ + } +#endif // __SSE2__ + if (elempack == 1) + { +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + rms 
+= _mm512_comp_reduce_add_ps(_rms_avx512); +#endif // __AVX512F__ + rms += _mm256_reduce_add_ps(_rms_avx); +#endif // __AVX__ + rms += _mm_reduce_add_ps(_rms); +#endif // __SSE2__ + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __SSE2__ + _rms = _mm_set1_ps(rms); +#if __AVX__ + _rms_avx = _mm256_insertf128_ps(_mm256_castps128_ps256(_rms), _rms, 1); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + } + + if (gamma_ptr) + { + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _gamma = _mm512_set1_ps(gamma_ptr[0]); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 1; + } + } +#endif // __AVX512F__ + if (elempack == 8) + { +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m256 _gamma0 = _mm256_set1_ps(gamma_ptr[0]); + __m256 _gamma1 = _mm256_set1_ps(gamma_ptr[1]); + __m512 _gamma = _mm512_insertf32x8(_mm512_castps256_ps512(_gamma0), _gamma1, 1); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 2; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _gamma = _mm256_set1_ps(gamma_ptr[0]); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 1; + } + } +#endif // __AVX__ + if (elempack == 4) + { +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m128 _gamma0 = _mm_set1_ps(gamma_ptr[0]); + __m128 _gamma1 = _mm_set1_ps(gamma_ptr[1]); + __m128 _gamma2 = _mm_set1_ps(gamma_ptr[2]); + __m128 _gamma3 = _mm_set1_ps(gamma_ptr[3]); + __m256 _gamma01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma0), _gamma1, 1); + __m256 _gamma23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma2), _gamma3, 1); + __m512 _gamma = _mm512_insertf32x8(_mm512_castps256_ps512(_gamma01), _gamma23, 1); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 4; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m128 _gamma0 = _mm_set1_ps(gamma_ptr[0]); + __m128 _gamma1 = _mm_set1_ps(gamma_ptr[1]); + __m256 _gamma = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma0), _gamma1, 1); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 2; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _gamma = _mm_set1_ps(gamma_ptr[0]); + _p = _mm_mul_ps(_p, _rms); + _p = _mm_mul_ps(_p, _gamma); + _mm_storeu_ps(ptr, _p); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _gamma = _mm512_loadu_ps(gamma_ptr); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _gamma = _mm256_loadu_ps(gamma_ptr); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + 
_mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _gamma = _mm_loadu_ps(gamma_ptr); + _p = _mm_mul_ps(_p, _rms); + _p = _mm_mul_ps(_p, _gamma); + _mm_storeu_ps(ptr, _p); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __SSE2__ + for (; i < size; i++) + { + ptr[0] = (ptr[0] * rms) * gamma_ptr[0]; + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + _p = _mm512_mul_ps(_p, _rms_avx512); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + _p = _mm256_mul_ps(_p, _rms_avx); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + _p = _mm_mul_ps(_p, _rms); + _mm_storeu_ps(ptr, _p); + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + ptr[0] = ptr[0] * rms; + ptr++; + } + } +} + +int RMSNorm_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + float* ptr = bottom_top_blob; + rmsnorm(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + rmsnorm(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/rmsnorm_x86.h b/src/layer/x86/rmsnorm_x86.h new file mode 100644 index 000000000000..2e6296db1c32 --- /dev/null +++ b/src/layer/x86/rmsnorm_x86.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
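Reviewer note: in the x86 kernel above, the elempack == 1 case reduces the per-lane accumulators to a single scalar rms and then rebroadcasts it back up through the SSE/AVX/AVX-512 registers via _mm256_insertf128_ps(_mm256_castps128_ps256(x), x, 1) and _mm512_insertf32x8(...), so one value drives every vector loop width. A tiny sketch of that widening trick follows; it assumes an AVX-capable build (e.g. -mavx) and is illustration only.

```cpp
// Sketch of the lane-widening trick used above: duplicate a 128-bit value into
// both halves of a 256-bit register so a single scalar rms can feed the AVX loop.
#include <immintrin.h>
#include <cstdio>

int main()
{
    __m128 r = _mm_set1_ps(0.25f);
    // low half = r, high half = r
    __m256 r8 = _mm256_insertf128_ps(_mm256_castps128_ps256(r), r, 1);

    float out[8];
    _mm256_storeu_ps(out, r8);
    for (int i = 0; i < 8; i++)
        printf("%.2f ", out[i]); // prints 0.25 eight times
    printf("\n");
    return 0;
}
```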
+ +#ifndef LAYER_RMSNORM_X86_H +#define LAYER_RMSNORM_X86_H + +#include "rmsnorm.h" + +namespace ncnn { + +class RMSNorm_x86 : public RMSNorm +{ +public: + RMSNorm_x86(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_X86_H diff --git a/src/platform.h.in b/src/platform.h.in index a0f17f39e315..50a9454b7da0 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -70,7 +70,7 @@ #ifdef __cplusplus #if NCNN_THREADS -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include #include @@ -86,7 +86,7 @@ namespace ncnn { #if NCNN_THREADS -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 class NCNN_EXPORT Mutex { public: @@ -141,7 +141,7 @@ public: private: DWORD key; }; -#else // (defined _WIN32 && !(defined __MINGW32__)) +#else // defined _WIN32 class NCNN_EXPORT Mutex { public: @@ -186,7 +186,7 @@ public: private: pthread_key_t key; }; -#endif // (defined _WIN32 && !(defined __MINGW32__)) +#endif // defined _WIN32 #else // NCNN_THREADS class NCNN_EXPORT Mutex { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d0d2a66899a6..54e778e35e79 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -153,6 +153,7 @@ ncnn_add_layer_test(Dropout) ncnn_add_layer_test(Einsum) ncnn_add_layer_test(Eltwise) ncnn_add_layer_test(ELU) +ncnn_add_layer_test(Embed) ncnn_add_layer_test(Erf) ncnn_add_layer_test(ExpandDims) ncnn_add_layer_test(Flatten) @@ -193,6 +194,7 @@ ncnn_add_layer_test(ReLU) ncnn_add_layer_test(Reorg) ncnn_add_layer_test(Requantize) ncnn_add_layer_test(Reshape) +ncnn_add_layer_test(RMSNorm) ncnn_add_layer_test(RNN) ncnn_add_layer_test(ROIPooling) ncnn_add_layer_test(ROIAlign) diff --git a/tests/test_embed.cpp b/tests/test_embed.cpp new file mode 100644 index 000000000000..9c007ee5d7e7 --- /dev/null +++ b/tests/test_embed.cpp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "testutil.h" + +static int test_embed(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + + std::vector weights(bias ? 
2 : 1); + weights[0] = RandomMat(num_output * input_dim); + if (bias) + weights[1] = RandomMat(num_output); + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_0() +{ + return 0 + || test_embed(128, 128, 128, 0) + || test_embed(128, 128, 128, 1) + || test_embed(127, 127, 127, 0) + || test_embed(127, 127, 127, 1) + || test_embed(124, 124, 124, 0) + || test_embed(124, 124, 124, 1); +} + +#if NCNN_INT8 +static int test_embed_int8(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + pd.set(18, 2); + + std::vector weights(bias ? 3 : 2); + weights[0] = RandomS8Mat(num_output * input_dim); + if (bias) + { + weights[1] = RandomMat(num_output); + weights[2] = RandomMat(1, 100.f, 200.f); + } + else + { + weights[1] = RandomMat(1, 100.f, 200.f); + } + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed_int8 failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_1() +{ + return 0 + || test_embed_int8(128, 128, 128, 0) + || test_embed_int8(128, 128, 128, 1) + || test_embed_int8(127, 127, 127, 0) + || test_embed_int8(127, 127, 127, 1) + || test_embed_int8(124, 124, 124, 0) + || test_embed_int8(124, 124, 124, 1); +} +#endif // NCNN_INT8 + +int main() +{ + SRAND(7767517); + +#if NCNN_INT8 + return test_embed_0() || test_embed_1(); +#else + return test_embed_0(); +#endif +} diff --git a/tests/test_rmsnorm.cpp b/tests/test_rmsnorm.cpp new file mode 100644 index 000000000000..2d88c162d8b5 --- /dev/null +++ b/tests/test_rmsnorm.cpp @@ -0,0 +1,121 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
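Reviewer note: test_embed_int8 above stores the scale as an extra 1-element weight drawn from [100, 200] and sets param 18, matching the new Embed int8 path in embed.cpp where each quantized row is recovered as em[p] * (1 / weight_data_int8_scale) plus the optional bias. A tiny sketch of that descale arithmetic with made-up values (not the RandomS8Mat test data):

```cpp
// Hedged sketch of the int8 descale used in embed.cpp above; the weights, scale
// and bias here are made-up illustration values only.
#include <cstdio>

int main()
{
    const signed char em[4] = {-64, 0, 32, 127};
    const float weight_data_int8_scale = 127.f; // e.g. weights stored as round(w * 127)
    const float bias = 0.5f;

    const float descale_em = 1.f / weight_data_int8_scale;
    for (int p = 0; p < 4; p++)
        printf("%.4f ", em[p] * descale_em + bias); // -0.0039 0.5000 0.7520 1.5000
    printf("\n");
    return 0;
}
```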
+ +#include "testutil.h" + +static int test_rmsnorm(const ncnn::Mat& a, int affine_size, float eps, int affine) +{ + ncnn::ParamDict pd; + pd.set(0, affine_size); + pd.set(1, eps); + pd.set(2, affine); + + std::vector weights(1); + weights[0] = RandomMat(affine_size); + + int ret = test_layer("RMSNorm", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_rmsnorm failed a.dims=%d a=(%d %d %d) affine_size=%d eps=%f affine=%d\n", a.dims, a.w, a.h, a.c, affine_size, eps, affine); + } + + return ret; +} + +static int test_rmsnorm_0() +{ + return 0 + || test_rmsnorm(RandomMat(6, 4, 2), 6, 0.01f, 0) + || test_rmsnorm(RandomMat(4, 5, 6), 4, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 3, 8), 3, 0.002f, 0) + || test_rmsnorm(RandomMat(5, 6, 12), 5, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 7, 16), 4, 0.02f, 0) + || test_rmsnorm(RandomMat(6, 7, 24), 6, 0.001f, 0) + || test_rmsnorm(RandomMat(5, 8, 32), 5, 0.001f, 0) + || test_rmsnorm(RandomMat(6, 4, 2), 6, 0.01f, 1) + || test_rmsnorm(RandomMat(4, 5, 6), 4, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 3, 8), 3, 0.002f, 1) + || test_rmsnorm(RandomMat(5, 6, 12), 5, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 7, 16), 4, 0.02f, 1) + || test_rmsnorm(RandomMat(6, 7, 24), 6, 0.001f, 1) + || test_rmsnorm(RandomMat(5, 8, 32), 5, 0.001f, 1); +} + +static int test_rmsnorm_1() +{ + return 0 + || test_rmsnorm(RandomMat(6, 4, 2), 24, 0.01f, 0) + || test_rmsnorm(RandomMat(4, 5, 6), 20, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 3, 8), 9, 0.002f, 0) + || test_rmsnorm(RandomMat(5, 6, 12), 30, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 7, 16), 28, 0.02f, 0) + || test_rmsnorm(RandomMat(6, 7, 24), 42, 0.001f, 0) + || test_rmsnorm(RandomMat(5, 8, 32), 40, 0.001f, 0) + || test_rmsnorm(RandomMat(6, 4, 2), 24, 0.01f, 1) + || test_rmsnorm(RandomMat(4, 5, 6), 20, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 3, 8), 9, 0.002f, 1) + || test_rmsnorm(RandomMat(5, 6, 12), 30, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 7, 16), 28, 0.02f, 1) + || test_rmsnorm(RandomMat(6, 7, 24), 42, 0.001f, 1) + || test_rmsnorm(RandomMat(5, 8, 32), 40, 0.001f, 1); +} + +static int test_rmsnorm_2() +{ + return 0 + || test_rmsnorm(RandomMat(4, 2), 4, 0.01f, 0) + || test_rmsnorm(RandomMat(5, 6), 5, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 8), 3, 0.002f, 0) + || test_rmsnorm(RandomMat(6, 12), 6, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 16), 4, 0.02f, 0) + || test_rmsnorm(RandomMat(7, 24), 7, 0.001f, 0) + || test_rmsnorm(RandomMat(8, 32), 8, 0.001f, 0) + || test_rmsnorm(RandomMat(4, 2), 4, 0.01f, 1) + || test_rmsnorm(RandomMat(5, 6), 5, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 8), 3, 0.002f, 1) + || test_rmsnorm(RandomMat(6, 12), 6, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 16), 4, 0.02f, 1) + || test_rmsnorm(RandomMat(7, 24), 7, 0.001f, 1) + || test_rmsnorm(RandomMat(8, 32), 8, 0.001f, 1); +} + +static int test_rmsnorm_3() +{ + return 0 + || test_rmsnorm(RandomMat(2), 2, 0.01f, 0) + || test_rmsnorm(RandomMat(6), 6, 0.01f, 0) + || test_rmsnorm(RandomMat(8), 8, 0.002f, 0) + || test_rmsnorm(RandomMat(12), 12, 0.02f, 0) + || test_rmsnorm(RandomMat(16), 16, 0.02f, 0) + || test_rmsnorm(RandomMat(24), 24, 0.001f, 0) + || test_rmsnorm(RandomMat(32), 32, 0.001f, 0) + || test_rmsnorm(RandomMat(2), 2, 0.01f, 1) + || test_rmsnorm(RandomMat(6), 6, 0.01f, 1) + || test_rmsnorm(RandomMat(8), 8, 0.002f, 1) + || test_rmsnorm(RandomMat(12), 12, 0.02f, 1) + || test_rmsnorm(RandomMat(16), 16, 0.02f, 1) + || test_rmsnorm(RandomMat(24), 24, 0.001f, 1) + || test_rmsnorm(RandomMat(32), 32, 0.001f, 1); +} + +int main() +{ + SRAND(7767517); + + 
return 0 + || test_rmsnorm_0() + || test_rmsnorm_1() + || test_rmsnorm_2() + || test_rmsnorm_3(); +} diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 88ccb948a9c8..ff86338bca9c 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -99,6 +99,7 @@ #include "layer/reorg.h" #include "layer/requantize.h" #include "layer/reshape.h" +#include "layer/rmsnorm.h" #include "layer/rnn.h" #include "layer/roialign.h" #include "layer/roipooling.h" @@ -1676,9 +1677,20 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 1=%d", input_dim) fprintf_param_value(" 2=%d", bias_term) fprintf_param_value(" 3=%d", weight_data_size) + fprintf_param_value(" 18=%d", int8_scale_term) fwrite_weight_tag_data(op->weight_data, bp); fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term) + { + ncnn::Mat weight_data_int8_scales(1); + weight_data_int8_scales[0] = op->weight_data_int8_scale; + fwrite_weight_data(weight_data_int8_scales, bp, 90, 100); + } +#endif // NCNN_INT8 } else if (layer->type == "Exp") { @@ -2007,6 +2019,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 3=%d", kdim) fprintf_param_value(" 4=%d", vdim) fprintf_param_value(" 5=%d", attn_mask) + fprintf_param_value(" 6=%e", scale) fwrite_weight_tag_data(op->q_weight_data, bp); fwrite_weight_data(op->q_bias_data, bp); @@ -2301,6 +2314,17 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 2=%d", c) fprintf_param_value(" 3=%d", permute) } + else if (layer->type == "RMSNorm") + { + ncnn::RMSNorm* op = (ncnn::RMSNorm*)layer; + ncnn::RMSNorm* op_default = (ncnn::RMSNorm*)layer_default; + + fprintf_param_value(" 0=%d", affine_size) + fprintf_param_value(" 1=%e", eps) + fprintf_param_value(" 2=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + } else if (layer->type == "RNN") { ncnn::RNN* op = (ncnn::RNN*)layer; diff --git a/tools/onnx/onnx2ncnn.cpp b/tools/onnx/onnx2ncnn.cpp index e443a28edf14..1b29e34c1285 100644 --- a/tools/onnx/onnx2ncnn.cpp +++ b/tools/onnx/onnx2ncnn.cpp @@ -2956,6 +2956,15 @@ static std::string trunc_name(std::string name) int main(int argc, char** argv) { + fprintf(stderr, "onnx2ncnn may not fully meet your needs. For more accurate and elegant\n\ +conversion results, please use PNNX. PyTorch Neural Network eXchange (PNNX) is\n\ +an open standard for PyTorch model interoperability. PNNX provides an open model\n\ +format for PyTorch. It defines computation graph as well as high level operators\n\ +strictly matches PyTorch. You can obtain pnnx through the following ways:\n\ +1. Install via python\n\ + pip3 install pnnx\n\ +2. 
Get the executable from https://github.com/pnnx/pnnx\n\ +For more information, please refer to https://github.com/pnnx/pnnx\n"); if (!(argc == 2 || argc == 4)) { fprintf(stderr, "Usage: %s [onnxpb] [ncnnparam] [ncnnbin]\n", argv[0]); diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index e2fc28da9a9c..7743a8ae453e 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -77,6 +77,7 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_ReplicationPad1d.cpp pass_level1/nn_ReplicationPad2d.cpp pass_level1/nn_ReplicationPad3d.cpp + pass_level1/nn_RMSNorm.cpp pass_level1/nn_RNN.cpp pass_level1/nn_RReLU.cpp pass_level1/nn_SELU.cpp @@ -163,6 +164,7 @@ set(pnnx_pass_level2_SRCS pass_level2/F_prelu.cpp pass_level2/F_relu.cpp pass_level2/F_relu6.cpp + pass_level2/F_rms_norm.cpp pass_level2/F_rrelu.cpp pass_level2/F_scaled_dot_product_attention.cpp pass_level2/F_selu.cpp @@ -367,6 +369,7 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_pixel_unshuffle.cpp pass_level5/fuse_layernorm.cpp pass_level5/fuse_multiheadattention.cpp + pass_level5/fuse_rmsnorm.cpp pass_level5/fuse_scaled_dot_product_attention.cpp pass_level5/fuse_select_to_unbind.cpp pass_level5/fuse_silu.cpp @@ -383,6 +386,7 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_static_layernorm.cpp pass_level5/fuse_static_linear.cpp pass_level5/fuse_static_prelu.cpp + pass_level5/fuse_static_rmsnorm.cpp pass_level5/normalize_einsum_equation.cpp pass_level5/unroll_rnn_op.cpp ) @@ -472,6 +476,8 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/F_prelu.cpp pass_ncnn/F_relu.cpp pass_ncnn/F_relu6.cpp + pass_ncnn/F_rms_norm.cpp + pass_ncnn/F_scaled_dot_product_attention.cpp pass_ncnn/F_selu.cpp pass_ncnn/F_sigmoid.cpp pass_ncnn/F_silu.cpp @@ -537,6 +543,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/nn_ReplicationPad1d.cpp pass_ncnn/nn_ReplicationPad2d.cpp pass_ncnn/nn_ReplicationPad3d.cpp + pass_ncnn/nn_RMSNorm.cpp pass_ncnn/nn_RNN.cpp pass_ncnn/nn_SELU.cpp pass_ncnn/nn_Sigmoid.cpp @@ -571,6 +578,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/torch_mm.cpp pass_ncnn/torch_norm.cpp pass_ncnn/torch_prod.cpp + pass_ncnn/torch_roll.cpp pass_ncnn/torch_slice_scatter.cpp pass_ncnn/torch_squeeze.cpp pass_ncnn/torch_sum.cpp @@ -586,12 +594,12 @@ if(PROTOBUF_FOUND) endif() if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE) - protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx.proto) + protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS}) target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES}) else() - add_library(onnxproto STATIC onnx.proto) + add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) protobuf_generate(TARGET onnxproto) target_link_libraries(onnxproto PUBLIC protobuf::libprotobuf) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 07d2bbefefd2..8b2b6dfd2d7f 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1091,7 +1091,8 @@ static std::string expand_expression(const Operator* op) || t == "maximum" || t == "min" || t == "minimum" - || t == "pow") + || t == "pow" + || t == "logaddexp") { std::string binaryop; if (t == "atan2") binaryop = "torch.atan2"; @@ -1101,6 +1102,7 @@ static std::string expand_expression(const Operator* op) if (t == "min") binaryop = "torch.min"; if (t == 
"minimum") binaryop = "torch.minimum"; if (t == "pow") binaryop = "torch.pow"; + if (t == "logaddexp") binaryop = "torch.logaddexp"; std::string a = exprstack.top(); exprstack.pop(); @@ -2109,6 +2111,15 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) fprintf(pyfp, ", "); } + if (op->type == "torch.max" || op->type == "torch.max") + { + if (op->has_param("dim") && op->outputs.size() == 1) + { + // torch.max and torch.min with dim returns tuple + fprintf(pyfp, ", _"); + } + } + if (op->type.substr(0, 7) == "Tensor.") { if (op->type == "Tensor.fill") diff --git a/tools/pnnx/src/load_onnx.cpp b/tools/pnnx/src/load_onnx.cpp index 36624d916bdd..9adf2b470888 100644 --- a/tools/pnnx/src/load_onnx.cpp +++ b/tools/pnnx/src/load_onnx.cpp @@ -14,7 +14,7 @@ #include "load_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/src/onnx-data.proto b/tools/pnnx/src/onnx-data.proto new file mode 100644 index 000000000000..d7d925d45d02 --- /dev/null +++ b/tools/pnnx/src/onnx-data.proto @@ -0,0 +1,155 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// SPDX-License-Identifier: Apache-2.0 + + +syntax = "proto2"; + +package onnx; +import "onnx-ml.proto"; + +// This file contains the proto definitions for MapProto and +// SequenceProto. These protos are used to represent the data structures +// of maps and sequence for use in test data or ModelProto. + +// Sequences +// +// Defines a dense, ordered, collection of elements that are of homogeneous types. +// Sequences can be made out of tensors, maps, or sequences. +// +// If a sequence is made out of tensors, the tensors must have the same element +// type (i.e. int32). In some cases, the tensors in a sequence can have different +// shapes. Whether the tensors can have different shapes or not depends on the +// type/shape associated with the corresponding "ValueInfo". For example, +// "Sequence" means that all tensors have same shape. However, +// "Sequence" means they can have different +// shapes (all of rank 2), where "omitted" means the corresponding dimension has +// no symbolic/constant value. Finally, "Sequence>" means +// that the different tensors can have different ranks, when the "shape" itself +// is omitted from the tensor-type. For a more complete description, refer to +// https://github.com/onnx/onnx/blob/main/docs/IR.md#static-tensor-shapes. +// +message SequenceProto { + + optional string name = 1; + + enum DataType { + UNDEFINED = 0; + TENSOR = 1; + SPARSE_TENSOR = 2; + SEQUENCE = 3; + MAP = 4; + OPTIONAL = 5; + } + + // The data type of the element. + // This field MUST have a valid SequenceProto.DataType value + optional int32 elem_type = 2; + + // For TensorProto values. + // When this field is present, the elem_type field MUST be TENSOR. + repeated TensorProto tensor_values = 3; + + // For SparseTensorProto values. + // When this field is present, the elem_type field MUST be SPARSE_TENSOR. + repeated SparseTensorProto sparse_tensor_values = 4; + + // For SequenceProto values, allowing sequences to be of themselves. + // When this field is present, the elem_type field MUST be SEQUENCE. + repeated SequenceProto sequence_values = 5; + + // For MapProto values. + // When this field is present, the elem_type field MUST be MAP. + repeated MapProto map_values = 6; + + // For OptionalProto values. + // When this field is present, the elem_type field MUST be Optional. 
+ repeated OptionalProto optional_values = 7; + +} + + +// Maps +// +// Specifies an associative table, defined by keys and values. +// MapProto is formed with a repeated field of keys (of type INT8, INT16, INT32, +// INT64, UINT8, UINT16, UINT32, UINT64, or STRING) and values (of type TENSOR, +// SPARSE_TENSOR, SEQUENCE, or MAP). Key types and value types have to remain +// the same throughout the instantiation of the MapProto. +// +message MapProto { + + optional string name = 1; + + // All MapProto data types must have the same length of keys and values. + + // The data type of the key. + // This field MUST have a valid TensorProto.DataType value of + // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING + optional int32 key_type = 2; + + // Every element of keys has to be one of the following data types + // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING. + // The integer cases are represented by the repeated int64 field keys below. + repeated int64 keys = 3; + + // If keys are strings, they are represented by the repeated bytes field + // string_keys below. + repeated bytes string_keys = 4; + + // MapProto values are represented in a SequenceProto of the same length as the + // repeated keys field and have to be one of the following data types + // TENSOR, SPARSE_TENSOR, MAP, SEQUENCE. + optional SequenceProto values = 5; +} + +// Optional +// +// +message OptionalProto { + + optional string name = 1; + + enum DataType { + UNDEFINED = 0; + TENSOR = 1; + SPARSE_TENSOR = 2; + SEQUENCE = 3; + MAP = 4; + OPTIONAL = 5; + } + + // The data type of the element, identifies if the OptionalProto value + // is Tensor, Sparse Tensor, Sequence, Map, or Optional. + // The type of the optional value MUST match the elem_type specified. + // This field MUST have a valid OptionalProto.DataType value. + optional int32 elem_type = 2; + + // For TensorProto value. + // When this field is present, the elem_type field MUST be TENSOR. + optional TensorProto tensor_value = 3; + + // For SparseTensorProto value. + // When this field is present, the elem_type field MUST be SPARSE_TENSOR. + optional SparseTensorProto sparse_tensor_value = 4; + + // For SequenceProto value. + // When this field is present, the elem_type field MUST be SEQUENCE. + optional SequenceProto sequence_value = 5; + + // For MapProto value. + // When this field is present, the elem_type field MUST be MAP. + optional MapProto map_value = 6; + + // For OptionalProto value, allowing optional to be of itself (completeness) + // When this field is present, the elem_type field MUST be OPTIONAL. + optional OptionalProto optional_value = 7; + +} + +// For using protobuf-lite +option optimize_for = LITE_RUNTIME; + diff --git a/tools/pnnx/src/onnx.proto b/tools/pnnx/src/onnx-ml.proto similarity index 92% rename from tools/pnnx/src/onnx.proto rename to tools/pnnx/src/onnx-ml.proto index 15012ce65c38..5f4c0f4a4e28 100644 --- a/tools/pnnx/src/onnx.proto +++ b/tools/pnnx/src/onnx-ml.proto @@ -24,6 +24,8 @@ package onnx; // // The normative semantic specification of the ONNX IR is found in docs/IR.md. // Definitions of the built-in neural network operators may be found in docs/Operators.md. +// Definitions of the built-in classical machine learning operators may be found in +// docs/Operators-ml.md. // Notes // @@ -106,7 +108,11 @@ enum Version { // IR VERSION 9 published on May 5, 2023 // Added AttributeProto to FunctionProto so that default attribute values can be set. 
// Added FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ. - IR_VERSION = 0x0000000000000009; + IR_VERSION_2023_5_5 = 0x0000000000000009; + + // IR VERSION 10 published on TBD + // Added UINT4, INT4. + IR_VERSION = 0x000000000000000A; } // Attributes @@ -190,6 +196,8 @@ message ValueInfoProto { optional TypeProto type = 2; // A human-readable documentation for this value. Markdown is allowed. optional string doc_string = 3; + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 4; } // Nodes @@ -211,12 +219,17 @@ message NodeProto { optional string op_type = 4; // namespace Operator // The domain of the OperatorSet that specifies the operator named by op_type. optional string domain = 7; // namespace Domain + // Overload identifier, used only to map this to a model-local function. + optional string overload = 8; // Additional named attributes. repeated AttributeProto attribute = 5; // A human-readable documentation for this node. Markdown is allowed. optional string doc_string = 6; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 9; } // Training information @@ -401,7 +414,7 @@ message ModelProto { // A list of function protos local to the model. // - // Name of the function "FunctionProto.name" should be unique within the domain "FunctionProto.domain". + // The (domain, name, overload) tuple must be unique across the function protos in this list. // In case of any conflicts the behavior (whether the model local functions are given higher priority, // or standard operator sets are given higher priotity or this is treated as error) is defined by // the runtimes. @@ -475,6 +488,9 @@ message GraphProto { // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. repeated TensorAnnotation quantization_annotation = 14; + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; + reserved 3, 4, 6 to 9; reserved "ir_version", "producer_version", "producer_tag", "domain"; } @@ -520,7 +536,11 @@ message TensorProto { FLOAT8E4M3FN = 17; // float 8, mostly used for coefficients, supports nan, not inf FLOAT8E4M3FNUZ = 18; // float 8, mostly used for coefficients, supports nan, not inf, no negative zero FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients - FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, inf, mostly used for gradients, no negative zero + FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, not inf, mostly used for gradients, no negative zero + + // 4-bit data-types + UINT4 = 21; // Unsigned integer in range [0, 15] + INT4 = 22; // Signed integer in range [-8, 7], using two's-complement representation // Future extensions go here. } @@ -555,11 +575,13 @@ message TensorProto { // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. repeated float float_data = 4 [packed = true]; - // For int32, uint8, int8, uint16, int16, bool, float8, and float16 values + // For int32, uint8, int8, uint16, int16, uint4, int4, bool, float8 and float16 values // float16 and float8 values must be bit-wise converted to an uint16_t prior // to writing to the buffer. + // uint4 and int4 values must be packed to 4bitx2 prior to writing to the buffer, the first element is stored in + // the 4 LSB and the second element is stored in the 4 MSB. 
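+ // For example, under this packing the int4 pair (3, -2) occupies a single byte 0xE3:
+ // 3 (0x3) in the low nibble and -2 (0xE in 4-bit two's complement) in the high nibble.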
// When this field is present, the data_type field MUST be - // INT32, INT16, INT8, UINT16, UINT8, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ + // INT32, INT16, INT8, INT4, UINT16, UINT8, UINT4, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ repeated int32 int32_data = 5 [packed = true]; // For strings. @@ -589,6 +611,7 @@ message TensorProto { // Complex64 elements must be written as two consecutive FLOAT values, real component first. // Complex128 elements must be written as two consecutive DOUBLE values, real component first. // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // uint4 and int4 values must be packed to 4bitx2, the first element is stored in the 4 LSB and the second element is stored in the 4 MSB. // // Note: the advantage of specific field rather than the raw_data field is // that in some cases (e.g. int data), protobuf does a better packing via @@ -631,6 +654,9 @@ message TensorProto { // When this field is present, the data_type field MUST be // UINT32 or UINT64 repeated uint64 uint64_data = 11 [packed = true]; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; } // A serialized sparse-tensor value @@ -724,6 +750,17 @@ message TypeProto { } + message Opaque { + // When missing, the domain is the same as the model's. + optional string domain = 1; + // The name is optional but significant when provided. + optional string name = 2; + // parameters that help defining the type + // DEPRECATED do not use. + // repeated TypeProto parameters = 3; + } + + oneof value { // The type of a tensor. Tensor tensor_type = 1; @@ -746,6 +783,9 @@ message TypeProto { // Type of the sparse tensor SparseTensor sparse_tensor_type = 8; + + Opaque opaque_type = 7; + } // An optional denotation can be used to denote the whole @@ -777,9 +817,8 @@ enum OperatorStatus { } message FunctionProto { - // The name of the function, similar usage of op_type in OperatorProto. - // Combined with FunctionProto.domain, this forms the unique identity of - // the FunctionProto. + // The name of the function, similar to op_type in NodeProto. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. optional string name = 1; // Deprecated since IR Version 8 @@ -826,9 +865,22 @@ message FunctionProto { repeated OperatorSetIdProto opset_import = 9; - // The domain which this function belongs to. Combined with FunctionProto.name, this forms the unique identity of - // the FunctionProto. + // The domain which this function belongs to. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. optional string domain = 10; + + // The overload identifier of the function. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. + optional string overload = 13; + + // Information for the values in the function. The ValueInfoProto.name's + // must be distinct and refer to names in the function (including inputs, + // outputs, and intermediate values). It is optional for a value to appear + // in value_info list. + repeated ValueInfoProto value_info = 12; + + // Named metadata values; keys should be distinct. 
+ repeated StringStringEntryProto metadata_props = 14; } // For using protobuf-lite diff --git a/tools/pnnx/src/onnx-operators-ml.proto b/tools/pnnx/src/onnx-operators-ml.proto new file mode 100644 index 000000000000..de62706f5cbd --- /dev/null +++ b/tools/pnnx/src/onnx-operators-ml.proto @@ -0,0 +1,136 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// Copyright (c) ONNX Project Contributors. +// Licensed under the Apache-2.0 license. + +syntax = "proto2"; + +package onnx; +import "onnx-ml.proto"; + +// +// This file contains the proto definitions for OperatorSetProto and +// OperatorProto. OperatorSetProtos are used to describe a versioned +// set of operators that can be used by a ModelProto. +// +// Like ModelProto, OperatorSetProto is defined as a top-level file/wire +// format, however their usage is different. +// +// ModelProto files are used to describe executable graphs that can be +// executed directly by a framework, runtime, or engine. +// +// OperatorSetProto files are used to describe a set of operators that are +// available in a given environment. The file TBD.TBD is the OperatorSetProto +// that describes the ONNX standard operators. +// + +// An OperatorProto represents the immutable specification of the signature +// and semantics of an operator. +// +// Operators are declared as part of an OperatorSet, which also defines the +// domain name for the set. +// +// Operators are uniquely identified by a three part identifier +// (domain, op_type, since_version) +// where +// *domain* is the domain of an operator set that +// contains this operator specification. +// +// *op_type* is the name of the operator as referenced by a +// NodeProto.op_type +// +// *since_version* is the version of the operator set that +// this operator was initially declared in. +// +message OperatorProto { + // The name of the operator within a domain. + // This field MUST be present in this version of the IR. + optional string op_type = 1; + + // The version of the operator set that first introduced this + // operator. This value MUST be the same value as the + // opset_version of the operator set that first published this operator. + // Subsequent versions of the operator set MUST NOT alter the signature + // or semantics of the operator once published as STABLE. + // This field MUST be present in this version of the IR. + optional int64 since_version = 2; + + // This field indicates whether the syntax, semantics, or presence + // of this operator is in an experimental or stable stage. Once an + // operator is published as STABLE, it's syntax and semantics MUST NOT + // change in subsequent versions of the operator set. + // When an operator is published as EXPERIMENTAL, the syntax and semantics + // of the operator MAY change across operator set versions. + // Operators "become" stable by deprecating the experimental version and + // introducing a new stable operator with the same op_type. + optional OperatorStatus status = 3; + + // Eventually we will declare the signature of the operator here + + // A human-readable documentation for this operator. Markdown is allowed. + optional string doc_string = 10; +} + +// An OperatorSetProto represents an immutable set of immutable operator +// specifications. +// +// The domain of the set (OperatorSetProto.domain) is a reverse-DNS name +// that disambiguates operator sets defined by independent entities. 
+// +// The version of the set (opset_version) is a monotonically increasing +// integer that indicates changes to the membership of the operator set. +// +// +// Operator sets are uniquely identified by a two part identifier (domain, opset_version) +// +// Like ModelProto, OperatorSetProto is intended as a top-level file/wire format, +// and thus has the standard format headers in addition to the operator set information. +// +message OperatorSetProto { + // All OperatorSetProtos start with a distingushed byte sequence to disambiguate + // protobuf files containing OperatorSets from other content. + // This field MUST be "ONNXOPSET" + // This field MUST be present in this version of the IR + optional string magic = 1; + + // All OperatorSetProtos indicate the version of the IR syntax and semantics + // they adhere to. It is always IR_VERSION. + // This field MUST be present in this version of the IR + optional int64 ir_version = 2; + + // The prerelease component of the SemVer of the IR. + // This field MAY be absent in this version of the IR + optional string ir_version_prerelease = 3; + + // The build metadata component of the SemVer of the IR. + // This field MAY be absent in this version of the IR + optional string ir_build_metadata = 7; + + // Domain name of the operator set, in reverse DNS form (e.g., com.acme.dnnops). + optional string domain = 4; + + // The version of the set of operators. This is a simple int value + // that is monotonically increasing as new versions of the operator set + // are published. All operators in this set MUST have since_version + // <= opset_version. + optional int64 opset_version = 5; + + // A human-readable documentation for this set of operators. Markdown is allowed. + optional string doc_string = 6; + + // The operators specified by this operator set. + // The (name, version) MUST be unique across all OperatorProtos in operator + repeated OperatorProto operator = 8; + + // The functions specified by this operator set. + // The (name, version) MUST be unique across all OperatorProtos/FunctionProtos in operator/functions + repeated FunctionProto functions = 9; +} + + +// For using protobuf-lite +option optimize_for = LITE_RUNTIME; + diff --git a/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp new file mode 100644 index 000000000000..498f0453c14f --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp @@ -0,0 +1,51 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
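+
+// This pass matches the TorchScript module __torch__.torch.nn.modules.normalization.RMSNorm
+// and rewrites it to the pnnx nn.RMSNorm operator, reading normalized_shape and eps from the
+// traced aten::rms_norm node and keeping the weight attribute when elementwise_affine is set.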
+ +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class RMSNorm : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.normalization.RMSNorm"; + } + + const char* type_str() const + { + return "nn.RMSNorm"; + } + + void write(Operator* op, const std::shared_ptr& graph, const torch::jit::Module& mod) const + { + const torch::jit::Node* rmsn = find_node_by_kind(graph, "aten::rms_norm"); + + op->params["normalized_shape"] = rmsn->namedInput("normalized_shape"); + op->params["eps"] = rmsn->namedInput("eps"); + op->params["elementwise_affine"] = mod.hasattr("weight"); + + if (mod.hasattr("weight")) + { + op->attrs["weight"] = mod.attr("weight").toTensor(); + } + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(RMSNorm) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2.cpp b/tools/pnnx/src/pass_level2.cpp index bc7e51b8d5d0..de44a3553662 100644 --- a/tools/pnnx/src/pass_level2.cpp +++ b/tools/pnnx/src/pass_level2.cpp @@ -1166,6 +1166,18 @@ static void functionize(Graph& graph) if (out0->consumers.size() == 1) continue; + bool all_consumers_are_same = true; + for (size_t j = 1; j < out0->consumers.size(); j++) + { + if (out0->consumers[j] != out0->consumers[0]) + { + all_consumers_are_same = false; + break; + } + } + if (all_consumers_are_same) + continue; + for (int j = (int)out0->consumers.size() - 1; j > 0; j--) { Operator* op1 = out0->consumers[j]; diff --git a/tools/pnnx/src/pass_level2/F_hardswish.cpp b/tools/pnnx/src/pass_level2/F_hardswish.cpp index caa724f55a73..2ce9e1b420bf 100644 --- a/tools/pnnx/src/pass_level2/F_hardswish.cpp +++ b/tools/pnnx/src/pass_level2/F_hardswish.cpp @@ -343,4 +343,30 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_hardswish_onnx_2, 9) +class F_hardswish_onnx_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input 0 1 input +prim::Constant op_0 0 1 v3 value=3 +aten::add op_1 2 1 input v3 a +aten::clamp op_2 1 1 a b max=6 min=0 +aten::mul op_3 2 1 input b c +prim::Constant op_4 0 1 v6 value=6 +aten::div op_5 2 1 c v6 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.hardswish"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_hardswish_onnx_3, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_interpolate.cpp b/tools/pnnx/src/pass_level2/F_interpolate.cpp index b93bd2df6c8d..119842b1c780 100644 --- a/tools/pnnx/src/pass_level2/F_interpolate.cpp +++ b/tools/pnnx/src/pass_level2/F_interpolate.cpp @@ -1005,7 +1005,7 @@ class F_interpolate_onnx : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Resize op_0 1 1 input out sizes=%sizes coordinate_transformation_mode=%coordinate_transformation_mode mode=%mode nearest_mode=floor cubic_coeff_a=* +Resize op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -1017,104 +1017,69 @@ pnnx.Output output 1 0 out bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const { - if (captured_params.at("sizes").type != 5) + if (captured_params.find("op_0.coordinate_transformation_mode") == captured_params.end()) return false; - const std::vector& sizes = captured_params.at("sizes").ai; - - if (sizes.size() < 3 || sizes.size() > 5) + if (captured_params.at("op_0.coordinate_transformation_mode").type != 4) return false; - const std::vector& input_shape = 
matched_operators.at("op_0")->inputs[0]->shape; - if (input_shape.size() < 3 || input_shape.size() > 5) + if (captured_params.find("op_0.mode") == captured_params.end()) return false; - if (input_shape[0] != sizes[0] || input_shape[1] != sizes[1]) + if (captured_params.at("op_0.mode").type != 4) return false; - return true; - } - - void write(Operator* op, const std::map& captured_params) const - { - const std::string& coordinate_transformation_mode = captured_params.at("coordinate_transformation_mode").s; - std::string mode = captured_params.at("mode").s; - const std::vector& sizes = captured_params.at("sizes").ai; - - if (mode == "linear") + if (captured_params.find("op_0.nearest_mode") != captured_params.end()) { - if (coordinate_transformation_mode == "half_pixel") - op->params["align_corners"] = false; - if (coordinate_transformation_mode == "align_corners") - op->params["align_corners"] = true; - - if (sizes.size() == 4) - mode = "bilinear"; - if (sizes.size() == 5) - mode = "trilinear"; + if (captured_params.at("op_0.nearest_mode").type != 4 || captured_params.at("op_0.nearest_mode").s != "floor") + return false; } - if (mode == "cubic") + if (captured_params.find("op_0.roi") != captured_params.end()) { - if (coordinate_transformation_mode == "half_pixel") - op->params["align_corners"] = false; - if (coordinate_transformation_mode == "align_corners") - op->params["align_corners"] = true; - - mode = "bicubic"; + if (captured_params.at("op_0.roi").type != 6 || !captured_params.at("op_0.roi").ai.empty()) + return false; } - op->params["mode"] = mode; - if (sizes.size() == 3) - op->params["size"] = {sizes[2]}; - if (sizes.size() == 4) - op->params["size"] = {sizes[2], sizes[3]}; - if (sizes.size() == 5) - op->params["size"] = {sizes[2], sizes[3], sizes[4]}; - } -}; + if (captured_params.find("op_0.sizes") == captured_params.end() && captured_params.find("op_0.scales") == captured_params.end()) + return false; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx, 10) + if (captured_params.find("op_0.sizes") != captured_params.end() && captured_params.at("op_0.sizes").type == 5 && !captured_params.at("op_0.sizes").ai.empty()) + { + const std::vector& sizes = captured_params.at("op_0.sizes").ai; -class F_interpolate_onnx_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -3 2 -pnnx.Input input 0 1 input -Resize op_0 1 1 input out scales=%scales coordinate_transformation_mode=%coordinate_transformation_mode mode=%mode nearest_mode=floor cubic_coeff_a=* -pnnx.Output output 1 0 out -)PNNXIR"; - } + if (sizes.size() < 3 || sizes.size() > 5) + return false; - const char* type_str() const - { - return "F.interpolate"; - } + const std::vector& input_shape = matched_operators.at("op_0")->inputs[0]->shape; + if (input_shape.size() < 3 || input_shape.size() > 5) + return false; - bool match(const std::map& captured_params) const - { - if (captured_params.at("scales").type != 6) - return false; - - const std::vector& scales = captured_params.at("scales").af; + if (input_shape[0] != sizes[0] || input_shape[1] != sizes[1]) + return false; + } + else if (captured_params.find("op_0.scales") != captured_params.end() && captured_params.at("op_0.scales").type == 6 && !captured_params.at("op_0.scales").af.empty()) + { + const std::vector& scales = captured_params.at("op_0.scales").af; - if (scales.size() < 3 || scales.size() > 5) - return false; + if (scales.size() < 3 || scales.size() > 5) + return false; - if (scales[0] != 1.f || 
scales[1] != 1.f) + if (scales[0] != 1.f || scales[1] != 1.f) + return false; + } + else + { return false; + } return true; } void write(Operator* op, const std::map& captured_params) const { - const std::string& coordinate_transformation_mode = captured_params.at("coordinate_transformation_mode").s; - std::string mode = captured_params.at("mode").s; - const std::vector& scales = captured_params.at("scales").af; + const std::string& coordinate_transformation_mode = captured_params.at("op_0.coordinate_transformation_mode").s; + std::string mode = captured_params.at("op_0.mode").s; if (mode == "linear") { @@ -1122,11 +1087,6 @@ pnnx.Output output 1 0 out op->params["align_corners"] = false; if (coordinate_transformation_mode == "align_corners") op->params["align_corners"] = true; - - if (scales.size() == 4) - mode = "bilinear"; - if (scales.size() == 5) - mode = "trilinear"; } if (mode == "cubic") @@ -1135,22 +1095,63 @@ pnnx.Output output 1 0 out op->params["align_corners"] = false; if (coordinate_transformation_mode == "align_corners") op->params["align_corners"] = true; - - mode = "bicubic"; } - op->params["mode"] = mode; - op->params["recompute_scale_factor"] = false; - if (scales.size() == 3) - op->params["scale_factor"] = {scales[2]}; - if (scales.size() == 4) - op->params["scale_factor"] = {scales[2], scales[3]}; - if (scales.size() == 5) - op->params["scale_factor"] = {scales[2], scales[3], scales[4]}; + if (captured_params.find("op_0.sizes") != captured_params.end() && captured_params.at("op_0.sizes").type == 5 && !captured_params.at("op_0.sizes").ai.empty()) + { + const std::vector& sizes = captured_params.at("op_0.sizes").ai; + + if (mode == "linear") + { + if (sizes.size() == 4) + mode = "bilinear"; + if (sizes.size() == 5) + mode = "trilinear"; + } + + if (mode == "cubic") + { + mode = "bicubic"; + } + + op->params["mode"] = mode; + if (sizes.size() == 3) + op->params["size"] = {sizes[2]}; + if (sizes.size() == 4) + op->params["size"] = {sizes[2], sizes[3]}; + if (sizes.size() == 5) + op->params["size"] = {sizes[2], sizes[3], sizes[4]}; + } + else if (captured_params.find("op_0.scales") != captured_params.end() && captured_params.at("op_0.scales").type == 6 && !captured_params.at("op_0.scales").af.empty()) + { + const std::vector& scales = captured_params.at("op_0.scales").af; + + if (mode == "linear") + { + if (scales.size() == 4) + mode = "bilinear"; + if (scales.size() == 5) + mode = "trilinear"; + } + + if (mode == "cubic") + { + mode = "bicubic"; + } + + op->params["mode"] = mode; + op->params["recompute_scale_factor"] = false; + if (scales.size() == 3) + op->params["scale_factor"] = {scales[2]}; + if (scales.size() == 4) + op->params["scale_factor"] = {scales[2], scales[3]}; + if (scales.size() == 5) + op->params["scale_factor"] = {scales[2], scales[3], scales[4]}; + } } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx_1, 10) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx, 10) class F_interpolate_onnx_2 : public GraphRewriterPass { diff --git a/tools/pnnx/src/pass_level2/F_linear.cpp b/tools/pnnx/src/pass_level2/F_linear.cpp index 4c454581ec3f..62f9d62e5054 100644 --- a/tools/pnnx/src/pass_level2/F_linear.cpp +++ b/tools/pnnx/src/pass_level2/F_linear.cpp @@ -129,7 +129,7 @@ class F_linear_onnx : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 weight pnnx.Input input_2 0 1 bias -Gemm op_0 3 1 input weight bias out alpha=1.000000e+00 beta=1.000000e+00 transB=1 +Gemm gemm 3 1 input weight bias out %*=%* pnnx.Output 
output 1 0 out )PNNXIR"; } @@ -138,6 +138,39 @@ pnnx.Output output 1 0 out { return "F.linear"; } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + if (captured_params.find("gemm.alpha") != captured_params.end()) + { + if (captured_params.at("gemm.alpha").type != 3 || captured_params.at("gemm.alpha").f != 1.f) + return false; + } + + if (captured_params.find("gemm.beta") != captured_params.end()) + { + if (captured_params.at("gemm.beta").type != 3 || captured_params.at("gemm.beta").f != 1.f) + return false; + } + + if (captured_params.find("gemm.transA") != captured_params.end()) + { + if (captured_params.at("gemm.transA").type != 2 || captured_params.at("gemm.transA").i != 0) + return false; + } + + if (captured_params.find("gemm.transB") == captured_params.end()) + return false; + + if (captured_params.at("gemm.transB").type != 2 || captured_params.at("gemm.transB").i != 1) + return false; + + return true; + } + + void write(Operator* op, const std::map& /*captured_params*/) const + { + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_linear_onnx, 10) @@ -152,7 +185,7 @@ class F_linear_onnx_1 : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 bias pnnx.Attribute weight 0 1 weight @data=(%in_features,%out_features)f32 -Gemm gemm 3 1 input weight bias out alpha=1.000000e+00 beta=1.000000e+00 +Gemm gemm 3 1 input weight bias out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -169,6 +202,35 @@ pnnx.Output output 1 0 out )PNNXIR"; } + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + if (captured_params.find("gemm.alpha") != captured_params.end()) + { + if (captured_params.at("gemm.alpha").type != 3 || captured_params.at("gemm.alpha").f != 1.f) + return false; + } + + if (captured_params.find("gemm.beta") != captured_params.end()) + { + if (captured_params.at("gemm.beta").type != 3 || captured_params.at("gemm.beta").f != 1.f) + return false; + } + + if (captured_params.find("gemm.transA") != captured_params.end()) + { + if (captured_params.at("gemm.transA").type != 2 || captured_params.at("gemm.transA").i != 0) + return false; + } + + if (captured_params.find("gemm.transB") != captured_params.end()) + { + if (captured_params.at("gemm.transB").type != 2 || captured_params.at("gemm.transB").i != 0) + return false; + } + + return true; + } + void write(const std::map& ops, const std::map& captured_params, const std::map& captured_attrs) const { const int in_features = captured_params.at("in_features").i; diff --git a/tools/pnnx/src/pass_level2/F_log_softmax.cpp b/tools/pnnx/src/pass_level2/F_log_softmax.cpp index 0264973783b0..ad9eba30d1cf 100644 --- a/tools/pnnx/src/pass_level2/F_log_softmax.cpp +++ b/tools/pnnx/src/pass_level2/F_log_softmax.cpp @@ -39,4 +39,77 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax, 10) +class F_log_softmax_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input_0 0 1 input +LogSoftmax op_0 1 1 input out axis=%dim +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.log_softmax"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax_onnx, 10) + +class F_log_softmax_onnx_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 input 
+Transpose op_0 1 1 input a perm=%perm +LogSoftmax op_1 1 1 a b axis=%axis +Transpose op_2 1 1 b out perm=%perm +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.log_softmax"; + } + + bool match(const std::map& captured_params) const + { + const std::vector& perm = captured_params.at("perm").ai; + const int axis = captured_params.at("axis").i; + + if (axis >= (int)perm.size()) + return false; + + int excount = 0; + for (int i = 0; i < (int)perm.size(); i++) + { + if (perm[i] != i) + excount++; + } + + if (excount != 2) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::vector& perm = captured_params.at("perm").ai; + const int axis = captured_params.at("axis").i; + + op->params["dim"] = perm[axis]; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax_onnx_1, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_logsigmoid.cpp b/tools/pnnx/src/pass_level2/F_logsigmoid.cpp index e35670686a0e..e0d4df607f23 100644 --- a/tools/pnnx/src/pass_level2/F_logsigmoid.cpp +++ b/tools/pnnx/src/pass_level2/F_logsigmoid.cpp @@ -37,4 +37,26 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_logsigmoid, 10) +class F_logsigmoid_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +aten::sigmoid op_0 1 1 input a +aten::log op_1 1 1 a out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.logsigmoid"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_logsigmoid_onnx, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_mish.cpp b/tools/pnnx/src/pass_level2/F_mish.cpp index 1a083ba85d9a..485a7e3b0b52 100644 --- a/tools/pnnx/src/pass_level2/F_mish.cpp +++ b/tools/pnnx/src/pass_level2/F_mish.cpp @@ -62,4 +62,27 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_mish_1, 9) +class F_mish_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +Softplus op_0 1 1 input a +aten::tanh op_1 1 1 a b +aten::mul op_2 2 1 input b out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.mish"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_mish_onnx, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_rms_norm.cpp b/tools/pnnx/src/pass_level2/F_rms_norm.cpp new file mode 100644 index 000000000000..aaa1813c5639 --- /dev/null +++ b/tools/pnnx/src/pass_level2/F_rms_norm.cpp @@ -0,0 +1,43 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
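+
+// Level-2 rewrite: matches the traced aten::rms_norm call together with its constant
+// eps operand and converts it into the functional F.rms_norm operator.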
+ +#include "pass_level2.h" + +namespace pnnx { + +class F_rms_norm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 weight +pnnx.Input input_2 0 1 normalized_shape +prim::Constant op_0 0 1 eps value=%eps +aten::rms_norm op_1 4 1 input normalized_shape weight eps out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.rms_norm"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_rms_norm, 10) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_selu.cpp b/tools/pnnx/src/pass_level2/F_selu.cpp index 592c3dd8ed77..9df970b1bbc1 100644 --- a/tools/pnnx/src/pass_level2/F_selu.cpp +++ b/tools/pnnx/src/pass_level2/F_selu.cpp @@ -37,4 +37,25 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_selu, 10) +class F_selu_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Selu op_0 1 1 input out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.selu"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_selu_onnx, 10) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softmin.cpp b/tools/pnnx/src/pass_level2/F_softmin.cpp index bb0768663c53..89e5d9aeaf83 100644 --- a/tools/pnnx/src/pass_level2/F_softmin.cpp +++ b/tools/pnnx/src/pass_level2/F_softmin.cpp @@ -40,4 +40,26 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softmin, 9) +class F_softmin_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +aten::neg op_0 1 1 input 6 +Softmax op_1 1 1 6 out axis=%dim +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softmin"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softmin_onnx, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softplus.cpp b/tools/pnnx/src/pass_level2/F_softplus.cpp index c6a5279b4140..8d346eb76ed5 100644 --- a/tools/pnnx/src/pass_level2/F_softplus.cpp +++ b/tools/pnnx/src/pass_level2/F_softplus.cpp @@ -39,4 +39,62 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus, 10) +class F_softplus_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input_0 0 1 input +Softplus op_0 1 1 input out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softplus"; + } + + void write(Operator* op, const std::map& /*captured_params*/) const + { + op->params["beta"] = 1.f; + op->params["threshold"] = 20.f; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus_onnx, 10) + +class F_softplus_onnx_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input_0 0 1 input +prim::Constant op_0 0 1 beta value=%beta +aten::mul op_1 2 1 input beta a +Softplus op_2 1 1 a b +prim::Constant op_3 0 1 beta2 value=%beta +aten::div op_4 2 1 b beta2 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softplus"; + } + + void write(Operator* op, const std::map& captured_params) const + { + op->params["beta"] = captured_params.at("beta"); + op->params["threshold"] = 20.f; + } +}; + 
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus_onnx_1, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softshrink.cpp b/tools/pnnx/src/pass_level2/F_softshrink.cpp index 286990bf2c57..8d14a8a644b4 100644 --- a/tools/pnnx/src/pass_level2/F_softshrink.cpp +++ b/tools/pnnx/src/pass_level2/F_softshrink.cpp @@ -38,4 +38,62 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softshrink, 10) +static bool NearlyEqual(float a, float b, float epsilon) +{ + if (a == b) + return true; + + float diff = (float)fabs(a - b); + if (diff <= epsilon) + return true; + + // relative error + return diff < epsilon * std::max(fabs(a), fabs(b)); +} + +class F_softshrink_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +15 14 +pnnx.Input input 0 1 input +prim::Constant op_0 0 1 lambd value=%lambd +aten::gt op_1 2 1 input lambd 8 +prim::Constant op_2 0 1 lambd2 value=%lambd +aten::sub op_3 2 1 input lambd2 9 +prim::Constant op_4 0 1 zero value=0 +aten::where op_5 3 1 8 9 zero a +prim::Constant op_6 0 1 mlambd value=%lambd2 +aten::lt op_7 2 1 input mlambd 11 +prim::Constant op_8 0 1 lambd3 value=%lambd +aten::add op_9 2 1 input lambd3 12 +prim::Constant op_10 0 1 zero2 value=0 +aten::where op_11 3 1 11 12 zero2 b +aten::add op_12 2 1 a b out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softshrink"; + } + + bool match(const std::map& captured_params) const + { + float lambd = captured_params.at("lambd").f; + float lambd2 = captured_params.at("lambd2").f; + return NearlyEqual(lambd, -lambd2, 0.001); + } + + void write(Operator* op, const std::map& captured_params) const + { + op->params["lambd"] = captured_params.at("lambd"); + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softshrink_onnx, 10) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softsign.cpp b/tools/pnnx/src/pass_level2/F_softsign.cpp index 4ec8ae9e520d..ae6005d63376 100644 --- a/tools/pnnx/src/pass_level2/F_softsign.cpp +++ b/tools/pnnx/src/pass_level2/F_softsign.cpp @@ -41,4 +41,28 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softsign, 10) +class F_softsign_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +aten::abs op_0 1 1 input 6 +prim::Constant op_1 0 1 8 value=1 +aten::add op_2 2 1 6 8 9 +aten::div op_3 2 1 input 9 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softsign"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softsign_onnx, 10) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_tanhshrink.cpp b/tools/pnnx/src/pass_level2/F_tanhshrink.cpp index d8d6c311fcd8..01e578bf8ade 100644 --- a/tools/pnnx/src/pass_level2/F_tanhshrink.cpp +++ b/tools/pnnx/src/pass_level2/F_tanhshrink.cpp @@ -39,4 +39,26 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_tanhshrink, 9) +class F_tanhshrink_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +aten::tanh op_0 1 1 input 7 +aten::sub op_1 2 1 input 7 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.tanhshrink"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_tanhshrink_onnx, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/Tensor_expand.cpp 
b/tools/pnnx/src/pass_level2/Tensor_expand.cpp index 23c1af6a863d..4c94d7b8e04f 100644 --- a/tools/pnnx/src/pass_level2/Tensor_expand.cpp +++ b/tools/pnnx/src/pass_level2/Tensor_expand.cpp @@ -61,4 +61,52 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_expand_1, 20) +class Tensor_expand_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Expand op_0 1 1 input out %*=%* +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Tensor.expand"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.shape") == captured_params.end()) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + if (captured_params.at("op_0.shape").type == 5) + { + op->params["shape"] = captured_params.at("op_0.shape"); + } + else // if (captured_params.at("op_0.shape").type == 2) + { + op->params["shape"] = std::vector{captured_params.at("op_0.shape").i}; + } + + // onnx set expand shape 1 for not changing the size of that dimension while torch uses -1 + for (size_t i = 0; i < op->params["shape"].ai.size(); i++) + { + if (op->params["shape"].ai[i] == 1) + op->params["shape"].ai[i] = -1; + } + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_expand_onnx, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/Tensor_reshape.cpp b/tools/pnnx/src/pass_level2/Tensor_reshape.cpp index 1c578a8d6333..412e609cc403 100644 --- a/tools/pnnx/src/pass_level2/Tensor_reshape.cpp +++ b/tools/pnnx/src/pass_level2/Tensor_reshape.cpp @@ -48,7 +48,7 @@ class Tensor_reshape_onnx : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 shape aten::cat op_0 1 1 shape cat dim=0 -Reshape op_1 2 1 input cat out allowzero=* +Reshape op_1 2 1 input cat out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -57,46 +57,15 @@ pnnx.Output output 1 0 out { return "Tensor.reshape"; } -}; - -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx, 19) - -class Tensor_reshape_onnx_1 : public Tensor_reshape_onnx -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 shape -aten::cat op_0 1 1 shape cat dim=0 -Reshape op_1 2 1 input cat out -pnnx.Output output 1 0 out -)PNNXIR"; - } -}; - -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_1, 19) -class Tensor_reshape_onnx_2 : public Tensor_reshape_onnx -{ -public: - const char* match_pattern_graph() const + void write(Operator* /*op*/, const std::map& /*captured_params*/) const { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 shape -Reshape op_1 2 1 input shape out allowzero=* -pnnx.Output output 1 0 out -)PNNXIR"; } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_2, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx, 19) -class Tensor_reshape_onnx_3 : public Tensor_reshape_onnx +class Tensor_reshape_onnx_1 : public Tensor_reshape_onnx { public: const char* match_pattern_graph() const @@ -105,15 +74,15 @@ class Tensor_reshape_onnx_3 : public Tensor_reshape_onnx 4 3 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 shape -Reshape op_1 2 1 input shape out +Reshape op_0 2 1 input shape out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_3, 20) 
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_1, 20) -class Tensor_reshape_onnx_4 : public GraphRewriterPass +class Tensor_reshape_onnx_2 : public GraphRewriterPass { public: const char* match_pattern_graph() const @@ -121,7 +90,7 @@ class Tensor_reshape_onnx_4 : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Reshape op_1 1 1 input out shape=%shape allowzero=* +Reshape op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -130,24 +99,28 @@ pnnx.Output output 1 0 out { return "Tensor.reshape"; } -}; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_4, 20) + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.shape") == captured_params.end()) + return false; + + return true; + } -class Tensor_reshape_onnx_5 : public Tensor_reshape_onnx_4 -{ -public: - const char* match_pattern_graph() const + void write(Operator* op, const std::map& captured_params) const { - return R"PNNXIR(7767517 -3 2 -pnnx.Input input 0 1 input -Reshape op_1 1 1 input out shape=%shape -pnnx.Output output 1 0 out -)PNNXIR"; + if (captured_params.at("op_0.shape").type == 5) + { + op->params["shape"] = captured_params.at("op_0.shape"); + } + else // if (captured_params.at("op_0.shape").type == 2) + { + op->params["shape"] = std::vector{captured_params.at("op_0.shape").i}; + } } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_5, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_2, 20) } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_max.cpp b/tools/pnnx/src/pass_level2/torch_max.cpp index b606fed066b8..5a993d6f55ea 100644 --- a/tools/pnnx/src/pass_level2/torch_max.cpp +++ b/tools/pnnx/src/pass_level2/torch_max.cpp @@ -35,6 +35,18 @@ pnnx.Output output 2 0 out indices { return "torch.max"; } + + void write(Operator* op, const std::map& captured_params) const + { + GraphRewriterPass::write(op, captured_params); + + // drop indices if not used + if (op->outputs[1]->consumers.empty()) + { + op->outputs[1]->producer = 0; + op->outputs.resize(1); + } + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_max, 20) @@ -78,11 +90,22 @@ pnnx.Output output 1 0 out return "torch.max"; } + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.axes") != captured_params.end()) + { + if (captured_params.at("op_0.axes").type != 5 || captured_params.at("op_0.axes").ai.size() != 1) + return false; + } + + return true; + } + void write(Operator* op, const std::map& captured_params) const { if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("op_0.axes"); + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; if (captured_params.find("op_0.keepdims") != captured_params.end()) { diff --git a/tools/pnnx/src/pass_level2/torch_min.cpp b/tools/pnnx/src/pass_level2/torch_min.cpp index 35cc4988a195..fa174614e018 100644 --- a/tools/pnnx/src/pass_level2/torch_min.cpp +++ b/tools/pnnx/src/pass_level2/torch_min.cpp @@ -35,6 +35,18 @@ pnnx.Output output 2 0 out indices { return "torch.min"; } + + void write(Operator* op, const std::map& captured_params) const + { + GraphRewriterPass::write(op, captured_params); + + // drop indices if not used + if (op->outputs[1]->consumers.empty()) + { + op->outputs[1]->producer = 0; + op->outputs.resize(1); + } + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_min, 20) @@ -78,11 +90,22 @@ pnnx.Output output 1 0 out return "torch.min"; } + bool match(const std::map& 
captured_params) const + { + if (captured_params.find("op_0.axes") != captured_params.end()) + { + if (captured_params.at("op_0.axes").type != 5 || captured_params.at("op_0.axes").ai.size() != 1) + return false; + } + + return true; + } + void write(Operator* op, const std::map& captured_params) const { if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("op_0.axes"); + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; if (captured_params.find("op_0.keepdims") != captured_params.end()) { diff --git a/tools/pnnx/src/pass_level2/torch_squeeze.cpp b/tools/pnnx/src/pass_level2/torch_squeeze.cpp index d7e157d94b12..dabffebc1262 100644 --- a/tools/pnnx/src/pass_level2/torch_squeeze.cpp +++ b/tools/pnnx/src/pass_level2/torch_squeeze.cpp @@ -110,20 +110,23 @@ class torch_squeeze_onnx_1 : public torch_squeeze_onnx return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Squeeze op_0 1 1 input out axes=%axes +Squeeze op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } void write(Operator* op, const std::map& captured_params) const { - if (captured_params.at("axes").type == 5 && captured_params.at("axes").ai.size() == 1) + if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("axes").ai[0]; - } - else - { - op->params["dim"] = captured_params.at("axes"); + if (captured_params.at("op_0.axes").type == 5 && captured_params.at("op_0.axes").ai.size() == 1) + { + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; + } + else + { + op->params["dim"] = captured_params.at("op_0.axes"); + } } } }; diff --git a/tools/pnnx/src/pass_level2/torch_tile.cpp b/tools/pnnx/src/pass_level2/torch_tile.cpp index d1504bacda84..a2f2780116c1 100644 --- a/tools/pnnx/src/pass_level2/torch_tile.cpp +++ b/tools/pnnx/src/pass_level2/torch_tile.cpp @@ -60,4 +60,45 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_tile_onnx, 20) +class torch_tile_onnx_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Tile op_0 1 1 input out %*=%* +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.tile"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.repeats") == captured_params.end()) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + if (captured_params.at("op_0.repeats").type == 5) + { + op->params["dims"] = captured_params.at("op_0.repeats"); + } + else // if (captured_params.at("op_0.repeats").type == 2) + { + op->params["dims"] = std::vector{captured_params.at("op_0.repeats").i}; + } + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_tile_onnx_1, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level3/fuse_expression.cpp b/tools/pnnx/src/pass_level3/fuse_expression.cpp index 708d1a548df4..8fc918fed9d7 100644 --- a/tools/pnnx/src/pass_level3/fuse_expression.cpp +++ b/tools/pnnx/src/pass_level3/fuse_expression.cpp @@ -154,6 +154,7 @@ static bool operand_maybe_tensor(const Operand* operand) || op->type == "aten::div" || op->type == "aten::floor_divide" || op->type == "aten::fmod" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" @@ -653,6 +654,7 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s else if (op->type == 
"aten::atan2" || op->type == "aten::floor_divide" || op->type == "aten::fmod" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" @@ -867,6 +869,7 @@ void fuse_expression(Graph& graph, const std::set& foldable_constan || op->type == "aten::fmod" || op->type == "aten::log" || op->type == "aten::log10" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" diff --git a/tools/pnnx/src/pass_level5.cpp b/tools/pnnx/src/pass_level5.cpp index 4903f1851179..5f08b80f5ef9 100644 --- a/tools/pnnx/src/pass_level5.cpp +++ b/tools/pnnx/src/pass_level5.cpp @@ -44,6 +44,7 @@ #include "pass_level5/fuse_multiheadattention.h" #include "pass_level5/fuse_pad_conv1d.h" #include "pass_level5/fuse_pad_conv2d.h" +#include "pass_level5/fuse_rmsnorm.h" #include "pass_level5/fuse_scaled_dot_product_attention.h" #include "pass_level5/fuse_select_to_unbind.h" #include "pass_level5/fuse_silu.h" @@ -60,6 +61,7 @@ #include "pass_level5/fuse_static_layernorm.h" #include "pass_level5/fuse_static_linear.h" #include "pass_level5/fuse_static_prelu.h" +#include "pass_level5/fuse_static_rmsnorm.h" #include "pass_level5/normalize_einsum_equation.h" #include "pass_level4/dead_code_elimination.h" #include "pass_level4/canonicalize.h" @@ -102,6 +104,7 @@ void pass_level5(Graph& g, const std::set& foldable_constants, cons fuse_static_groupnorm(g); fuse_static_instancenorm(g); fuse_static_layernorm(g); + fuse_static_rmsnorm(g); fuse_static_conv(g); fuse_static_convtranspose(g); @@ -143,6 +146,7 @@ void pass_level5(Graph& g, const std::set& foldable_constants, cons fuse_channel_shuffle(g); fuse_layernorm(g); + fuse_rmsnorm(g); fuse_multiheadattention(g); fuse_scaled_dot_product_attention(g); diff --git a/tools/pnnx/src/pass_level5/eval_expression.cpp b/tools/pnnx/src/pass_level5/eval_expression.cpp index 44e1f7e36911..c7d5d5d02260 100644 --- a/tools/pnnx/src/pass_level5/eval_expression.cpp +++ b/tools/pnnx/src/pass_level5/eval_expression.cpp @@ -390,7 +390,8 @@ static std::string eval_expression(const Operator* op) || t == "floor_divide" || t == "fmod" || t == "pow" - || t == "remainder") + || t == "remainder" + || t == "logaddexp") { std::string a = exprstack.top(); exprstack.pop(); @@ -459,6 +460,11 @@ static std::string eval_expression(const Operator* op) r += bf; exprstack.push(std::to_string(r)); } + if (t == "logaddexp") + { + float r = log(exp(af) + exp(bf)); + exprstack.push(std::to_string(r)); + } } else { diff --git a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp index 2a9f3b837b17..c178788f2a79 100644 --- a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp @@ -702,6 +702,57 @@ pnnx.Output output 1 0 out } }; +class fuse_multiheadattention_pass_1_1_1 : public fuse_multiheadattention_pass_sameqkv +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +19 18 +pnnx.Input input 0 1 input +nn.Linear op_0 1 1 input 256 bias=%qbias in_features=%embed_dim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input 257 bias=%kbias in_features=%embed_dim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input 260 bias=%vbias in_features=%embed_dim out_features=%embed_dim @bias @weight +Tensor.view op_3 1 1 256 263 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.view op_4 1 1 257 258 shape=(%batch,%size,%num_heads,%feat_per_head) 
+Tensor.view op_5 1 1 260 261 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 263 264 dims=(0,2,1,3) +Tensor.permute op_7 1 1 258 259 dims=(0,2,1,3) +Tensor.permute op_8 1 1 261 262 dims=(0,2,1,3) +torch.transpose op_9 1 1 259 265 dim0=-1 dim1=-2 +torch.matmul op_10 2 1 264 265 266 +pnnx.Expression op_11 1 1 266 267 expr=div(@0,%sqrt_feat_per_head) +F.softmax softmax 1 1 267 268 dim=%softmax_dim +torch.matmul op_13 2 1 268 262 269 +Tensor.permute op_14 1 1 269 270 dims=(0,2,1,3) +Tensor.reshape op_15 1 1 270 271 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 271 out bias=%outbias in_features=%embed_dim out_features=%embed_dim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int embed_dim = captured_params.at("embed_dim").i; + const int num_heads = captured_params.at("num_heads").i; + const int feat_per_head = captured_params.at("feat_per_head").i; + const float sqrt_feat_per_head = captured_params.at("sqrt_feat_per_head").f; + const int softmax_dim = captured_params.at("softmax_dim").i; + + if (embed_dim != num_heads * feat_per_head) + return false; + + if (!NearlyEqual(sqrt_feat_per_head, sqrt(feat_per_head), 0.001)) + return false; + + int softmax_input_rank = (int)matched_operators.at("softmax")->inputs[0]->shape.size(); + if (softmax_dim != -1 && softmax_dim != softmax_input_rank - 1) + return false; + + return true; + } +}; + class fuse_multiheadattention_pass_1_2 : public fuse_multiheadattention_pass_qkv { public: @@ -1734,6 +1785,64 @@ pnnx.Output output 1 0 out } }; +class fuse_multiheadattention_pass_onnx_1_2 : public fuse_multiheadattention_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +21 20 +pnnx.Input input_q 0 1 input +nn.Linear op_0 1 1 input 14 bias=%qkvbias in_features=%embed_dim out_features=%qkv_out_features @bias @weight +Tensor.reshape op_1 1 1 14 15 shape=(%batch,%size,1,3,%embed_dim) +Tensor.permute op_2 1 1 15 16 dims=(3,1,2,0,4) +torch.squeeze op_3 1 1 16 17 dim=3 +torch.unbind op_4 1 3 17 18 19 20 dim=0 +Tensor.reshape op_5 1 1 18 21 shape=(%size,%num_heads,%feat_per_head) +Tensor.reshape op_6 1 1 19 23 shape=(%size,%num_heads,%feat_per_head) +Tensor.reshape op_7 1 1 20 25 shape=(%size,%num_heads,%feat_per_head) +Tensor.permute op_8 1 1 21 22 dims=(1,0,2) +Tensor.permute op_9 1 1 23 24 dims=(1,0,2) +Tensor.permute op_10 1 1 25 26 dims=(1,0,2) +Tensor.reshape op_11 1 1 22 27 shape=(%batch,%num_heads,%size,%feat_per_head) +Tensor.reshape op_12 1 1 24 28 shape=(%batch,%num_heads,%size,%feat_per_head) +Tensor.reshape op_13 1 1 26 29 shape=(%batch,%num_heads,%size,%feat_per_head) +F.scaled_dot_product_attention op_14 3 1 27 28 29 35 dropout_p=0.000000e+00 is_causal=False +Tensor.permute op_15 1 1 35 36 dims=(2,0,1,3) +Tensor.reshape op_16 1 1 36 37 shape=(%size,%embed_dim) +nn.Linear out_proj 1 1 37 38 bias=%outbias in_features=%embed_dim out_features=%embed_dim @bias @weight +Tensor.reshape op_18 1 1 38 out shape=(%size,%batch,%embed_dim) +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.MultiheadAttention attention 1 1 input out embed_dim=%embed_dim kdim=%embed_dim vdim=%embed_dim num_heads=%num_heads batch_first=False add_zero_attn=False add_bias_kv=False +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& 
matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int embed_dim = captured_params.at("embed_dim").i; + const int qkv_out_features = captured_params.at("qkv_out_features").i; + const int num_heads = captured_params.at("num_heads").i; + const int feat_per_head = captured_params.at("feat_per_head").i; + + if (qkv_out_features != embed_dim * 3) + return false; + + if (embed_dim != num_heads * feat_per_head) + return false; + + return true; + } +}; + class fuse_multiheadattention_pass_onnx_2 : public fuse_multiheadattention_pass { public: @@ -2024,6 +2133,7 @@ void fuse_multiheadattention(Graph& graph) fuse_multiheadattention_pass_q_samekv d; fuse_multiheadattention_pass_1 b1; fuse_multiheadattention_pass_1_1 b11; + fuse_multiheadattention_pass_1_1_1 b111; fuse_multiheadattention_pass_1_2 b12; fuse_multiheadattention_pass_2 c1; fuse_multiheadattention_pass_3 d1; @@ -2048,6 +2158,7 @@ void fuse_multiheadattention(Graph& graph) fuse_multiheadattention_pass_onnx onnx0; fuse_multiheadattention_pass_onnx_1 onnx1; fuse_multiheadattention_pass_onnx_1_1 onnx1a; + fuse_multiheadattention_pass_onnx_1_2 onnx1b; fuse_multiheadattention_pass_onnx_2 onnx2; fuse_multiheadattention_pass_onnx_3 onnx3; fuse_multiheadattention_pass_onnx_4 onnx4; @@ -2063,6 +2174,7 @@ void fuse_multiheadattention(Graph& graph) pnnx_graph_rewrite(graph, &d, opindex); pnnx_graph_rewrite(graph, &b1, opindex); pnnx_graph_rewrite(graph, &b11, opindex); + pnnx_graph_rewrite(graph, &b111, opindex); pnnx_graph_rewrite(graph, &b12, opindex); pnnx_graph_rewrite(graph, &c1, opindex); pnnx_graph_rewrite(graph, &d1, opindex); @@ -2087,6 +2199,7 @@ void fuse_multiheadattention(Graph& graph) pnnx_graph_rewrite(graph, &onnx0, opindex); pnnx_graph_rewrite(graph, &onnx1, opindex); pnnx_graph_rewrite(graph, &onnx1a, opindex); + pnnx_graph_rewrite(graph, &onnx1b, opindex); pnnx_graph_rewrite(graph, &onnx2, opindex); pnnx_graph_rewrite(graph, &onnx3, opindex); pnnx_graph_rewrite(graph, &onnx4, opindex); diff --git a/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp b/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp new file mode 100644 index 000000000000..7b99770ed6ed --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp @@ -0,0 +1,97 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
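+
+// Level-5 fusion: recognizes the unfused RMSNorm expression
+//   mul(weight, mul(input, rsqrt(add(mean(pow(input,2), dim=-1, keepdim=True), eps))))
+// as well as the equivalent reciprocal(sqrt(...)) and div(1, sqrt(...)) spellings,
+// and replaces the subgraph with a single nn.RMSNorm operator with elementwise_affine=True.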
+ +#include "fuse_rmsnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_rmsnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,rsqrt(add(@2,%eps)))) +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm rmsnorm 1 1 input out elementwise_affine=True eps=%eps normalized_shape=(%c) @weight=%op_0.data +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +class fuse_rmsnorm_pass_1 : public fuse_rmsnorm_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2.000000e+00) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,reciprocal(sqrt(add(@2,%eps))))) +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +class fuse_rmsnorm_pass_onnx : public fuse_rmsnorm_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2.000000e+00) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,div(1.000000e+00,sqrt(add(@2,%eps))))) +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +void fuse_rmsnorm(Graph& graph) +{ + fuse_rmsnorm_pass a; + fuse_rmsnorm_pass_1 a1; + fuse_rmsnorm_pass_onnx b; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &a1, opindex); + pnnx_graph_rewrite(graph, &b, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_rmsnorm.h b/tools/pnnx/src/pass_level5/fuse_rmsnorm.h new file mode 100644 index 000000000000..0ba18e37f61b --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_rmsnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "ir.h" + +namespace pnnx { + +void fuse_rmsnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp b/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp index 8f265f374dc3..a6dcbc86db75 100644 --- a/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp @@ -62,7 +62,7 @@ pnnx.Output output 1 0 out pnnx.Input input_0 0 1 query pnnx.Input input_1 0 1 key pnnx.Input input_2 0 1 value -F.scaled_dot_product_attention op_0 3 1 query key value out attn_mask=None dropout_p=0.0 is_causal=False +F.scaled_dot_product_attention sdpa 3 1 query key value out attn_mask=None dropout_p=0.0 is_causal=False pnnx.Output output 1 0 out )PNNXIR"; } @@ -114,7 +114,7 @@ pnnx.Input input_Rh 0 1 Rh pnnx.Input input_Rw 0 1 Rw pnnx.Expression RhRw 2 1 Rh Rw RhRw expr=add(@0,@1) #RhRw=(%batch,%h,%w,%h,%w)f32 Tensor.reshape attn_mask 1 1 RhRw attn_mask shape=(%batch,%qsize,%qsize) #attn_mask=(%batch,%qsize,%qsize)f32 -F.scaled_dot_product_attention op_0 4 1 query key value attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask +F.scaled_dot_product_attention sdpa 4 1 query key value attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask pnnx.Output output 1 0 out )PNNXIR"; } @@ -137,15 +137,95 @@ pnnx.Output output 1 0 out } }; +class fuse_scaled_dot_product_attention_pass_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +12 11 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +pnnx.Input input_3 0 1 attn_mask +Tensor.permute op_0 1 1 query 13 dims=(0,2,1,3) +Tensor.permute op_1 1 1 key 20 dims=(0,2,3,1) +Tensor.permute op_2 1 1 value 19 dims=(0,2,1,3) +torch.matmul op_3 2 1 13 20 21 +pnnx.Expression op_4 2 1 21 attn_mask 23 expr=add(@0,@1) +F.softmax softmax 1 1 23 24 dim=%softmax_dim +torch.matmul op_6 2 1 24 19 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +9 8 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +pnnx.Input input_3 0 1 attn_mask +Tensor.permute op_0 1 1 query q dims=(0,2,1,3) +Tensor.permute op_1 1 1 key k dims=(0,2,1,3) +Tensor.permute op_2 1 1 value v dims=(0,2,1,3) +F.scaled_dot_product_attention sdpa 4 1 q k v attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int softmax_dim = captured_params.at("softmax_dim").i; + + int softmax_input_rank = (int)matched_operators.at("softmax")->inputs[0]->shape.size(); + if (softmax_dim != -1 && softmax_dim != softmax_input_rank - 1) + return false; + + return true; + } + + void write(const std::map& ops, const std::map& /*captured_params*/, const std::map& /*captured_attrs*/) const + { + Operator* op = ops.at("sdpa"); + + op->params["scale"] = 1.f; + + // rewrite qkv shape + { + std::vector q_shape = ops.at("op_0")->inputs[0]->shape; + std::vector k_shape = ops.at("op_1")->inputs[0]->shape; + std::vector v_shape = ops.at("op_2")->inputs[0]->shape; + + if (!q_shape.empty()) + std::swap(q_shape[1], q_shape[2]); + if (!k_shape.empty()) + std::swap(k_shape[1], k_shape[2]); + if (!v_shape.empty()) + std::swap(v_shape[1], v_shape[2]); + + ops.at("op_0")->outputs[0]->shape = 
q_shape; + ops.at("op_0")->outputs[0]->type = ops.at("op_0")->inputs[0]->type; + ops.at("op_1")->outputs[0]->shape = k_shape; + ops.at("op_1")->outputs[0]->type = ops.at("op_1")->inputs[0]->type; + ops.at("op_2")->outputs[0]->shape = v_shape; + ops.at("op_2")->outputs[0]->type = ops.at("op_2")->inputs[0]->type; + } + } +}; + void fuse_scaled_dot_product_attention(Graph& graph) { #if TORCH_VERSION_MAJOR >= 2 fuse_scaled_dot_product_attention_pass a; fuse_scaled_dot_product_attention_pass_1 b; + fuse_scaled_dot_product_attention_pass_onnx onnx0; int opindex = 0; pnnx_graph_rewrite(graph, &a, opindex); pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &onnx0, opindex); #endif } diff --git a/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp new file mode 100644 index 000000000000..ed68c026d309 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp @@ -0,0 +1,57 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fuse_static_rmsnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Frmsnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @data +F.rms_norm op_0 2 1 input weight out normalized_shape=%normalized_shape eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm rmsn 1 1 input out normalized_shape=%normalized_shape eps=%eps elementwise_affine=True @weight=%op_weight.data +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +void fuse_static_rmsnorm(Graph& graph) +{ + fuse_static_Frmsnorm_pass a; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h new file mode 100644 index 000000000000..c88b703cb072 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_rmsnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp index 1d9ca98e03d8..aaef7db2d74b 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp @@ -63,6 +63,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool1d, 20) +class F_max_pool1d_1 : public F_max_pool1d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool1d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool1d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp index ba5a52f4f7dd..3519c8a022b7 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp @@ -66,6 +66,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool2d, 20) +class F_max_pool2d_1 : public F_max_pool2d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool2d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool2d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp index 5476907fa881..2caede16a293 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp @@ -69,6 +69,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool3d, 20) +class F_max_pool3d_1 : public F_max_pool3d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool3d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool3d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp b/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp new file mode 100644 index 000000000000..8230168312c2 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp @@ -0,0 +1,65 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
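For orientation before the F_rms_norm converter that follows: its write() flattens %normalized_shape into a single affine size and maps a missing eps to 0. A hypothetical Python helper mirroring that logic (param ids 0/1/2 are affine size, eps, and the affine flag, which is 0 here because the functional pattern has weight=None):

    from functools import reduce
    from operator import mul

    def rmsnorm_layer_params(normalized_shape, eps):
        # mirrors the converter: 0 = affine_size, 1 = eps (0 when None), 2 = affine flag
        affine_size = reduce(mul, normalized_shape, 1)
        return {"0": affine_size, "1": 0.0 if eps is None else eps, "2": 0}

    # e.g. rmsnorm_layer_params((12, 16), 1e-3) -> {"0": 192, "1": 0.001, "2": 0}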
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_rms_norm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.rms_norm op_0 1 1 input out weight=None normalized_shape=%normalized_shape eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "RMSNorm"; + } + + const char* name_str() const + { + return "rmsn"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::vector& normalized_shape = captured_params.at("normalized_shape").ai; + int affine_size = normalized_shape[0]; + for (size_t i = 1; i < normalized_shape.size(); i++) + { + affine_size *= normalized_shape[i]; + } + + const float eps = captured_params.at("eps").type == 0 ? 0.f : captured_params.at("eps").f; + + op->params["0"] = affine_size; + op->params["1"] = eps; + op->params["2"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_rms_norm, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp b/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp new file mode 100644 index 000000000000..af9f06b3f528 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_scaled_dot_product_attention : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +16 15 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 attn_mask +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 4 1 16 17 18 attn_mask 19 dropout_p=0.0 is_causal=False scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "MultiHeadAttention"; + } + + const char* name_str() const + { + return "sdpa_attention"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + op->params["0"] = captured_params.at("embed_dim"); + op->params["1"] = captured_params.at("num_heads"); + + const int embed_dim = captured_params.at("embed_dim").i; + const int qdim = captured_params.at("qdim").i; + const int kdim = captured_params.at("kdim").i; + const int vdim = captured_params.at("vdim").i; + + op->params["2"] = embed_dim * qdim; + op->params["3"] = kdim; + op->params["4"] = vdim; + op->params["5"] = 1; + op->params["6"] = captured_params.at("scale"); + + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = captured_attrs.at("op_0.weight"); + if (captured_params.at("qbias").b) + { + op->attrs["2"] = captured_attrs.at("op_0.bias"); + } + else + { + op->attrs["2"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["3"] = Attribute(); + op->attrs["3"].data = {0, 0, 0, 0}; + op->attrs["4"] = captured_attrs.at("op_1.weight"); + if (captured_params.at("kbias").b) + { + op->attrs["5"] = captured_attrs.at("op_1.bias"); + } + else + { + op->attrs["5"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + op->attrs["7"] = captured_attrs.at("op_2.weight"); + if (captured_params.at("vbias").b) + { + op->attrs["8"] = captured_attrs.at("op_2.bias"); + } + else + { + op->attrs["8"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["9"] = Attribute(); + op->attrs["9"].data = {0, 0, 0, 0}; + op->attrs["a"] = captured_attrs.at("out_proj.weight"); + if (captured_params.at("outbias").b) + { + op->attrs["b"] = captured_attrs.at("out_proj.bias"); + } + else + { + op->attrs["b"] = Attribute({qdim}, std::vector(qdim, 0.f)); + } + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention, 10) + +class F_scaled_dot_product_attention_1 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +17 16 +pnnx.Input input_0 0 1 input 
+pnnx.Input input_1 0 1 kv +pnnx.Input input_2 0 1 attn_mask +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 kv k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 kv v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%qsize,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 4 1 16 17 18 attn_mask 19 dropout_p=0.0 is_causal=False scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%qsize,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_1, 10) + +class F_scaled_dot_product_attention_2 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +15 14 +pnnx.Input input 0 1 input +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 3 1 16 17 18 19 dropout_p=0.0 is_causal=False attn_mask=None scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + F_scaled_dot_product_attention::write(op, captured_params, captured_attrs); + op->params["5"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_2, 10) + +class F_scaled_dot_product_attention_3 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +16 15 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 kv +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 kv k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 kv v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%qsize,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 
18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 3 1 16 17 18 19 dropout_p=0.0 is_causal=False attn_mask=None scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%qsize,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + F_scaled_dot_product_attention::write(op, captured_params, captured_attrs); + op->params["5"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_3, 10) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/expand_expression.cpp b/tools/pnnx/src/pass_ncnn/expand_expression.cpp index f8f97baa55c0..2fdc6d77d62e 100644 --- a/tools/pnnx/src/pass_ncnn/expand_expression.cpp +++ b/tools/pnnx/src/pass_ncnn/expand_expression.cpp @@ -185,6 +185,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx || t == "div" || t == "floor_divide" || t == "fmod" + || t == "logaddexp" || t == "max" || t == "maximum" || t == "min" @@ -211,6 +212,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx if (t == "sub") op_binary->params["0"] = 1; if (t == "mul") op_binary->params["0"] = 2; if (t == "div") op_binary->params["0"] = 3; + if (t == "logaddexp") fprintf(stderr, "BinaryOp logaddexp not supported yet\n"); // TODO if (t == "max" || t == "maximum") op_binary->params["0"] = 4; if (t == "min" || t == "minimum") op_binary->params["0"] = 5; if (t == "floor_divide") fprintf(stderr, "BinaryOp floor_divide not supported yet\n"); // TODO diff --git a/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp b/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp new file mode 100644 index 000000000000..7fda637c5cac --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp @@ -0,0 +1,70 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
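On the logaddexp entry added to expand_expression above: it is the numerically stable form of log(exp(a)+exp(b)), which has no ncnn BinaryOp yet, hence the warning. An illustrative check of the identity (values kept small so the naive form does not overflow):

    import torch

    a, b = torch.randn(3, 4), torch.randn(3, 4)
    ref = torch.logaddexp(a, b)
    naive = torch.log(torch.exp(a) + torch.exp(b))  # same value, but overflows for large inputs
    assert torch.allclose(ref, naive, atol=1e-6)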
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_RMSNorm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm op_0 1 1 input out normalized_shape=%normalized_shape eps=%eps elementwise_affine=%elementwise_affine @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "RMSNorm"; + } + + const char* name_str() const + { + return "rmsn"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& normalized_shape = captured_params.at("normalized_shape").ai; + int affine_size = normalized_shape[0]; + for (size_t i = 1; i < normalized_shape.size(); i++) + { + affine_size *= normalized_shape[i]; + } + + const float eps = captured_params.at("eps").type == 0 ? 0.f : captured_params.at("eps").f; + + op->params["0"] = affine_size; + op->params["1"] = eps; + op->params["2"] = captured_params.at("elementwise_affine").b ? 1 : 0; + + if (captured_params.at("elementwise_affine").b) + { + op->attrs["0"] = captured_attrs.at("op_0.weight"); + } + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_RMSNorm, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 6e53f7aa841b..d4532422b522 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -46,6 +46,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.group_norm", "F.instance_norm", "F.interpolate", + "F.layer_norm", "F.linear", "F.local_response_norm", "F.lp_pool1d", @@ -56,6 +57,8 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.pixel_shuffle", "F.pixel_unshuffle", "F.prelu", + "F.rms_norm", + "F.scaled_dot_product_attention", "F.unfold", "F.upsample_bilinear", "F.upsample_nearest", @@ -90,6 +93,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.InstanceNorm2d", "nn.InstanceNorm3d", "nn.LocalResponseNorm", + "nn.LayerNorm", "nn.LPPool1d", "nn.LPPool2d", "nn.MaxPool1d", @@ -103,6 +107,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.ReplicationPad1d", "nn.ReplicationPad2d", "nn.ReplicationPad3d", + "nn.RMSNorm", "nn.Softmax2d", "nn.Unfold", "nn.Upsample", diff --git a/tools/pnnx/src/pass_ncnn/torch_max.cpp b/tools/pnnx/src/pass_ncnn/torch_max.cpp index 76cd33f239b6..95987da5162f 100644 --- a/tools/pnnx/src/pass_ncnn/torch_max.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_max.cpp @@ -65,6 +65,22 @@ pnnx.Output output 2 0 out indices REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_max, 20) +class torch_max_0 : public torch_max +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.max op_0 1 1 input out dim=%dim keepdim=%keepdim +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_max_0, 20) + class torch_max_1 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/torch_min.cpp b/tools/pnnx/src/pass_ncnn/torch_min.cpp index 49851b443dbf..3ef2ae47da00 100644 --- a/tools/pnnx/src/pass_ncnn/torch_min.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_min.cpp @@ -65,6 +65,22 @@ pnnx.Output output 2 0 out indices REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_min, 20) +class torch_min_0 : public torch_min +{ +public: 
+ const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.min op_0 1 1 input out dim=%dim keepdim=%keepdim +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_min_0, 20) + class torch_min_1 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/torch_roll.cpp b/tools/pnnx/src/pass_ncnn/torch_roll.cpp new file mode 100644 index 000000000000..c7c295933337 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/torch_roll.cpp @@ -0,0 +1,193 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class torch_roll : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.roll op_0 1 1 input out dims=%dims shifts=%shifts +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +Slice slice 1 2 input a b +Concat concat 2 1 b a out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.at("dims").type != 5) + return false; + + if (captured_params.at("dims").ai.size() != 1) + return false; + + if (captured_params.at("shifts").type != 5) + return false; + + if (captured_params.at("shifts").ai.size() != 1) + return false; + + return true; + } + + void write(const std::map& ops, const std::map& captured_params, const std::map& captured_attrs) const + { + GraphRewriterPass::write(ops, captured_params, captured_attrs); + + const Operand* in = ops.at("slice")->inputs[0]; + + const int batch_index = in->params.at("__batch_index").i; + + int axis = captured_params.at("dims").ai[0]; + if (axis == batch_index) + { + fprintf(stderr, "roll along batch axis %d is not supported\n", batch_index); + } + + if (axis < 0) + { + int input_rank = in->shape.size(); + axis = input_rank + axis; + } + + if (axis > batch_index) + axis -= 1; + + ops.at("slice")->params["1"] = axis; + + ops.at("concat")->params["0"] = axis; + + const int shift = captured_params.at("shifts").ai[0]; + ops.at("slice")->params["2"] = std::vector{-shift}; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_roll, 20) + +class torch_roll_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.roll op_0 1 1 input out dims=%dims shifts=%shifts +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input 0 1 input +Slice slice 1 2 input a b +Slice slice_a 1 2 a a0 a1 +Slice slice_b 1 2 b b0 b1 +Concat concat_a 2 1 a1 a0 a10 +Concat concat_b 2 1 b1 b0 b10 +Concat concat 2 1 b10 a10 
out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.at("dims").type != 5) + return false; + + if (captured_params.at("dims").ai.size() != 2) + return false; + + if (captured_params.at("shifts").type != 5) + return false; + + if (captured_params.at("shifts").ai.size() != 2) + return false; + + return true; + } + + void write(const std::map& ops, const std::map& captured_params, const std::map& captured_attrs) const + { + GraphRewriterPass::write(ops, captured_params, captured_attrs); + + const Operand* in = ops.at("slice")->inputs[0]; + + const int batch_index = in->params.at("__batch_index").i; + + int axis0 = captured_params.at("dims").ai[0]; + int axis1 = captured_params.at("dims").ai[1]; + if (axis0 == batch_index || axis1 == batch_index) + { + fprintf(stderr, "roll along batch axis %d is not supported\n", batch_index); + } + + if (axis0 < 0) + { + int input_rank = in->shape.size(); + axis0 = input_rank + axis0; + } + + if (axis0 > batch_index) + axis0 -= 1; + + if (axis1 < 0) + { + int input_rank = in->shape.size(); + axis1 = input_rank + axis1; + } + if (axis1 > batch_index) + axis1 -= 1; + + ops.at("slice")->params["1"] = axis0; + ops.at("slice_a")->params["1"] = axis1; + ops.at("slice_b")->params["1"] = axis1; + + ops.at("concat_a")->params["0"] = axis1; + ops.at("concat_b")->params["0"] = axis1; + ops.at("concat")->params["0"] = axis0; + + const int shift0 = captured_params.at("shifts").ai[0]; + const int shift1 = captured_params.at("shifts").ai[1]; + ops.at("slice")->params["2"] = std::vector{-shift0}; + ops.at("slice_a")->params["2"] = std::vector{-shift1}; + ops.at("slice_b")->params["2"] = std::vector{-shift1}; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_roll_1, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_onnx.cpp b/tools/pnnx/src/pass_onnx.cpp index dd9194111fc1..87dd27d27cbc 100644 --- a/tools/pnnx/src/pass_onnx.cpp +++ b/tools/pnnx/src/pass_onnx.cpp @@ -14,7 +14,7 @@ #include "pass_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include @@ -820,6 +820,8 @@ void pass_onnx(const onnx::ModelProto& model, Graph& pnnx_graph) is_attr_weight = true; if (sim_op_type == "Gather" && j == 0) is_attr_weight = true; + if (sim_op_type == "Gemm" && (j == 1 || j == 2)) + is_attr_weight = true; if (sim_op_type == "GroupNormalization" && (j == 1 || j == 2)) is_attr_weight = true; if (sim_op_type == "GRU" && (j == 1 || j == 2 || j == 3 || j == 5)) diff --git a/tools/pnnx/src/pass_onnx/canonicalize.h b/tools/pnnx/src/pass_onnx/canonicalize.h index a24ad86a9fdb..6ec55f2d1401 100644 --- a/tools/pnnx/src/pass_onnx/canonicalize.h +++ b/tools/pnnx/src/pass_onnx/canonicalize.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/dead_code_elimination.h b/tools/pnnx/src/pass_onnx/dead_code_elimination.h index b890b6a7d7c5..7d8b7e0d25d6 100644 --- a/tools/pnnx/src/pass_onnx/dead_code_elimination.h +++ b/tools/pnnx/src/pass_onnx/dead_code_elimination.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/eliminate_noop.h b/tools/pnnx/src/pass_onnx/eliminate_noop.h index e465e398c0aa..3325ae9cf104 100644 --- a/tools/pnnx/src/pass_onnx/eliminate_noop.h +++ b/tools/pnnx/src/pass_onnx/eliminate_noop.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/fold_constants.h b/tools/pnnx/src/pass_onnx/fold_constants.h index 98d6ef717abc..f165a96e177f 100644 --- a/tools/pnnx/src/pass_onnx/fold_constants.h +++ b/tools/pnnx/src/pass_onnx/fold_constants.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp index a3021d33c907..39dc8d808826 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp @@ -32,6 +32,7 @@ struct constant_as_attribute }; static constant_as_attribute caas[] = { + {"Expand", 1, "shape"}, {"Gather", 1, "indices"}, {"If", 0, "cond"}, {"Pad", 1, "pads"}, @@ -42,6 +43,7 @@ static constant_as_attribute caas[] = { {"ReduceProd", 1, "axes"}, {"ReduceSum", 1, "axes"}, {"Reshape", 1, "shape"}, + {"Resize", 1, "roi"}, {"Resize", 2, "scales"}, {"Resize", 3, "sizes"}, {"Slice", 1, "starts"}, @@ -49,6 +51,7 @@ static constant_as_attribute caas[] = { {"Slice", 3, "axes"}, {"Slice", 4, "steps"}, {"Squeeze", 1, "axes"}, + {"Tile", 1, "repeats"}, {"Unsqueeze", 1, "axes"}, {"Upsample", 1, "scales"}, }; diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h index ad6cf80007c4..a90c089fee6c 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/inline_containers.h b/tools/pnnx/src/pass_onnx/inline_containers.h index 56b21f47b374..e3051c5e3330 100644 --- a/tools/pnnx/src/pass_onnx/inline_containers.h +++ b/tools/pnnx/src/pass_onnx/inline_containers.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/inline_if_graph.h b/tools/pnnx/src/pass_onnx/inline_if_graph.h index c84b5761ac57..e9c1c2f0ee8c 100644 --- a/tools/pnnx/src/pass_onnx/inline_if_graph.h +++ b/tools/pnnx/src/pass_onnx/inline_if_graph.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/model_stat.h b/tools/pnnx/src/pass_onnx/model_stat.h index dd62e67a1bc9..993630b1b4b7 100644 --- a/tools/pnnx/src/pass_onnx/model_stat.h +++ b/tools/pnnx/src/pass_onnx/model_stat.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp index 0e8851f05f20..21cf6076d2d2 100644 --- a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp index 070981e1d642..a8e3e96be6be 100644 --- a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp index 5a006fe37090..6f5be930e643 100644 --- a/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp index ff2a5dd8aad6..9fdcfdd72d64 100644 --- a/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp b/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp index c3639904d477..96448c0f25c3 100644 --- a/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp b/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp index 0f9405f160ae..afac686a22aa 100644 --- a/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp b/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp index c9aeac561ac2..2cd6b7dd750f 100644 --- a/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp b/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp index 6413685fcb5c..f90c23cbb6ab 100644 --- a/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_GELU.cpp b/tools/pnnx/src/pass_onnx/nn_GELU.cpp index f5b7000e017a..22d2823673a5 100644 
--- a/tools/pnnx/src/pass_onnx/nn_GELU.cpp +++ b/tools/pnnx/src/pass_onnx/nn_GELU.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp b/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp index f4ecf2895576..fece12e2bcee 100644 --- a/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp +++ b/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Linear.cpp b/tools/pnnx/src/pass_onnx/nn_Linear.cpp index 4dce81908b2b..0515a8ea4549 100644 --- a/tools/pnnx/src/pass_onnx/nn_Linear.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Linear.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp index 47924bd33fcf..518abd434b0b 100644 --- a/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp index c8c467f5ba29..04de8bd104a2 100644 --- a/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp b/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp index a29ec9d93062..df1bd0922734 100644 --- a/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/shape_inference.h b/tools/pnnx/src/pass_onnx/shape_inference.h index b4cd657bb812..b484d5265cae 100644 --- a/tools/pnnx/src/pass_onnx/shape_inference.h +++ b/tools/pnnx/src/pass_onnx/shape_inference.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/save_onnx.cpp b/tools/pnnx/src/save_onnx.cpp index 3406c730b2d0..3ef3a772a2f4 100644 --- a/tools/pnnx/src/save_onnx.cpp +++ b/tools/pnnx/src/save_onnx.cpp @@ -14,7 +14,7 @@ #include "save_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index 2046a6392566..0dd566c37b58 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -61,6 +61,7 @@ pnnx_add_test(F_pixel_unshuffle) pnnx_add_test(F_prelu) pnnx_add_test(F_relu) pnnx_add_test(F_relu6) +pnnx_add_test(F_rms_norm) pnnx_add_test(F_rrelu) pnnx_add_test(F_scaled_dot_product_attention) pnnx_add_test(F_selu) @@ -145,6 +146,7 @@ pnnx_add_test(nn_ReLU6) pnnx_add_test(nn_ReplicationPad1d) pnnx_add_test(nn_ReplicationPad2d) pnnx_add_test(nn_ReplicationPad3d) +pnnx_add_test(nn_RMSNorm) pnnx_add_test(nn_RNN) pnnx_add_test(nn_RReLU) pnnx_add_test(nn_SELU) @@ -234,6 +236,7 @@ pnnx_add_test(torch_ones_like) pnnx_add_test(torch_positive) pnnx_add_test(torch_prod) pnnx_add_test(torch_repeat_interleave) +pnnx_add_test(torch_roll) pnnx_add_test(torch_scatter_add) pnnx_add_test(torch_slice_scatter) pnnx_add_test(torch_sum) @@ -295,6 +298,7 @@ pnnx_add_test(torch_floor) pnnx_add_test(torch_imag) pnnx_add_test(torch_log) pnnx_add_test(torch_log10) +pnnx_add_test(torch_logaddexp) pnnx_add_test(torch_maximum) pnnx_add_test(torch_minimum) pnnx_add_test(torch_neg) @@ -342,6 +346,7 @@ pnnx_add_test(pnnx_fuse_input_unpack) pnnx_add_test(pnnx_fuse_layernorm) pnnx_add_test(pnnx_fuse_linear_batchnorm1d) pnnx_add_test(pnnx_fuse_multiheadattention) +pnnx_add_test(pnnx_fuse_rmsnorm) pnnx_add_test(pnnx_fuse_scaled_dot_product_attention) pnnx_add_test(pnnx_fuse_select_to_unbind) pnnx_add_test(pnnx_fuse_slice_to_tensor_split) diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index a682e42835b9..49cb063f335e 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -53,6 +53,7 @@ pnnx_ncnn_add_test(F_pixel_unshuffle) pnnx_ncnn_add_test(F_prelu) pnnx_ncnn_add_test(F_relu) pnnx_ncnn_add_test(F_relu6) +pnnx_ncnn_add_test(F_rms_norm) pnnx_ncnn_add_test(F_selu) pnnx_ncnn_add_test(F_sigmoid) pnnx_ncnn_add_test(F_silu) @@ -123,6 +124,7 @@ pnnx_ncnn_add_test(nn_ReLU6) pnnx_ncnn_add_test(nn_ReplicationPad1d) pnnx_ncnn_add_test(nn_ReplicationPad2d) pnnx_ncnn_add_test(nn_ReplicationPad3d) +pnnx_ncnn_add_test(nn_RMSNorm) pnnx_ncnn_add_test(nn_RNN) pnnx_ncnn_add_test(nn_SELU) pnnx_ncnn_add_test(nn_Sigmoid) @@ -162,6 +164,7 @@ pnnx_ncnn_add_test(torch_min) pnnx_ncnn_add_test(torch_mm) pnnx_ncnn_add_test(torch_norm) pnnx_ncnn_add_test(torch_prod) +pnnx_ncnn_add_test(torch_roll) pnnx_ncnn_add_test(torch_slice_scatter) pnnx_ncnn_add_test(torch_sum) pnnx_ncnn_add_test(torch_squeeze) diff --git a/tools/pnnx/tests/ncnn/test_F_layer_norm.py b/tools/pnnx/tests/ncnn/test_F_layer_norm.py index 92244f179104..9d590aa76dda 100644 --- a/tools/pnnx/tests/ncnn/test_F_layer_norm.py +++ b/tools/pnnx/tests/ncnn/test_F_layer_norm.py @@ -37,8 +37,8 @@ def test(): net.eval() torch.manual_seed(0) - x = torch.rand(12, 24) - y = torch.rand(3, 12, 16) + x = torch.rand(1, 12, 24) + y = torch.rand(1, 3, 12, 16) a = net(x, y) @@ -48,7 +48,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_F_layer_norm.pt inputshape=[12,24],[3,12,16]") + os.system("../../src/pnnx test_F_layer_norm.pt 
inputshape=[1,12,24],[1,3,12,16]") # ncnn inference import test_F_layer_norm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_F_rms_norm.py b/tools/pnnx/tests/ncnn/test_F_rms_norm.py new file mode 100644 index 000000000000..f30f72f9ac45 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_rms_norm.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.w3 = nn.Parameter(torch.rand(24)) + self.w4 = nn.Parameter(torch.rand(12, 16)) + + def forward(self, x, y): + x = F.rms_norm(x, (24,), self.w3) + + y = F.rms_norm(y, (16,), None) + z = F.rms_norm(y, (12,16), self.w4, eps=1e-3) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24) + y = torch.rand(1, 3, 12, 16) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_F_rms_norm.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_F_rms_norm.pt inputshape=[1,12,24],[1,3,12,16]") + + # ncnn inference + import test_F_rms_norm_ncnn + b = test_F_rms_norm_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-3, 1e-3): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py b/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py index a45444060d04..d409bdfba3a1 100644 --- a/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py +++ b/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py @@ -36,8 +36,8 @@ def test(): net.eval() torch.manual_seed(0) - x = torch.rand(24, 64) - y = torch.rand(12, 24, 64) + x = torch.rand(1, 24, 64) + y = torch.rand(1, 12, 24, 64) a = net(x, y) @@ -47,7 +47,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_nn_LayerNorm.pt inputshape=[24,64],[12,24,64]") + os.system("../../src/pnnx test_nn_LayerNorm.pt inputshape=[1,24,64],[1,12,24,64]") # ncnn inference import test_nn_LayerNorm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py new file mode 100644 index 000000000000..e69ad1220bc1 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.rmsn_0 = nn.RMSNorm(64) + self.rmsn_0.weight = nn.Parameter(torch.rand(64)) + self.rmsn_1 = nn.RMSNorm(normalized_shape=(24,64), eps=1e-2, elementwise_affine=False) + + def forward(self, x, y): + x = self.rmsn_0(x) + y = self.rmsn_0(y) + z = self.rmsn_1(y) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 24, 64) + y = torch.rand(1, 12, 24, 64) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_nn_RMSNorm.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_nn_RMSNorm.pt inputshape=[1,24,64],[1,12,24,64]") + + # ncnn inference + import test_nn_RMSNorm_ncnn + b = test_nn_RMSNorm_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-3, 1e-3): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_roll.py b/tools/pnnx/tests/ncnn/test_torch_roll.py new file mode 100644 index 000000000000..6412ee6ba603 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_roll.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
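Before the roll test below, a minimal sketch of the slice-then-concat equivalence that the ncnn torch_roll rewrite added earlier in this diff relies on; the shift, axis, and tensor shape are illustrative, and only a single non-batch axis is handled here:

    import torch

    def roll_via_slice_concat(x, shift, dim):
        # single-axis equivalent of the Slice + Concat rewrite
        shift = shift % x.size(dim)
        a, b = torch.split(x, [x.size(dim) - shift, shift], dim=dim)
        return torch.cat((b, a), dim=dim)

    x = torch.rand(5, 9, 11)
    assert torch.equal(roll_via_slice_concat(x, -2, -1), torch.roll(x, -2, -1))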
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3, 1) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_roll.pt") + + # torchscript to ncnn + import os + os.system("../../src/pnnx test_torch_roll.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_roll_ncnn + b = test_torch_roll_ncnn.test_inference() + + print(x) + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + print(a0) + print(b0) + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_unbind.py b/tools/pnnx/tests/ncnn/test_torch_unbind.py index 3b8e427010c4..8e224612d7ec 100644 --- a/tools/pnnx/tests/ncnn/test_torch_unbind.py +++ b/tools/pnnx/tests/ncnn/test_torch_unbind.py @@ -26,6 +26,7 @@ def forward(self, x, y): x0 = F.relu(x0) x1 = F.relu(x1) + x2 = F.relu(x2) y0 = F.relu(y0) y1 = F.relu(y1) y2 = F.relu(y2) @@ -35,7 +36,7 @@ def forward(self, x, y): y6 = F.relu(y6) y7 = F.relu(y7) y8 = F.relu(y8) - return x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, y8 + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8 def test(): net = Model() diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt index 0c0a136fbaf1..673fa0434d9c 100644 --- a/tools/pnnx/tests/onnx/CMakeLists.txt +++ b/tools/pnnx/tests/onnx/CMakeLists.txt @@ -29,16 +29,27 @@ pnnx_onnx_add_test(F_layer_norm) pnnx_onnx_add_test(F_leaky_relu) pnnx_onnx_add_test(F_linear) pnnx_onnx_add_test(F_local_response_norm) +pnnx_onnx_add_test(F_logsigmoid) +pnnx_onnx_add_test(F_log_softmax) pnnx_onnx_add_test(F_max_pool1d) pnnx_onnx_add_test(F_max_pool2d) pnnx_onnx_add_test(F_max_pool3d) +pnnx_onnx_add_test(F_mish) pnnx_onnx_add_test(F_pad) pnnx_onnx_add_test(F_prelu) pnnx_onnx_add_test(F_relu) pnnx_onnx_add_test(F_relu6) pnnx_onnx_add_test(F_scaled_dot_product_attention) +pnnx_onnx_add_test(F_selu) pnnx_onnx_add_test(F_sigmoid) +pnnx_onnx_add_test(F_silu) pnnx_onnx_add_test(F_softmax) +pnnx_onnx_add_test(F_softmin) +pnnx_onnx_add_test(F_softplus) +pnnx_onnx_add_test(F_softshrink) +pnnx_onnx_add_test(F_softsign) +pnnx_onnx_add_test(F_tanh) +pnnx_onnx_add_test(F_tanhshrink) pnnx_onnx_add_test(F_upsample_bilinear) pnnx_onnx_add_test(F_upsample_nearest) pnnx_onnx_add_test(F_upsample) @@ -74,10 +85,13 @@ pnnx_onnx_add_test(nn_LayerNorm) pnnx_onnx_add_test(nn_LeakyReLU) pnnx_onnx_add_test(nn_Linear) pnnx_onnx_add_test(nn_LocalResponseNorm) +pnnx_onnx_add_test(nn_LogSigmoid) +pnnx_onnx_add_test(nn_LogSoftmax) pnnx_onnx_add_test(nn_LSTM) pnnx_onnx_add_test(nn_MaxPool1d) pnnx_onnx_add_test(nn_MaxPool2d) pnnx_onnx_add_test(nn_MaxPool3d) +pnnx_onnx_add_test(nn_Mish) pnnx_onnx_add_test(nn_MultiheadAttention) pnnx_onnx_add_test(nn_PReLU) pnnx_onnx_add_test(nn_ReflectionPad1d) @@ -88,8 +102,16 @@ pnnx_onnx_add_test(nn_ReplicationPad1d) pnnx_onnx_add_test(nn_ReplicationPad2d) pnnx_onnx_add_test(nn_ReplicationPad3d) pnnx_onnx_add_test(nn_RNN) +pnnx_onnx_add_test(nn_SELU) pnnx_onnx_add_test(nn_Sigmoid) +pnnx_onnx_add_test(nn_SiLU) pnnx_onnx_add_test(nn_Softmax) +pnnx_onnx_add_test(nn_Softmin) 
+pnnx_onnx_add_test(nn_Softplus) +pnnx_onnx_add_test(nn_Softshrink) +pnnx_onnx_add_test(nn_Softsign) +pnnx_onnx_add_test(nn_Tanh) +pnnx_onnx_add_test(nn_Tanhshrink) pnnx_onnx_add_test(nn_Upsample) pnnx_onnx_add_test(nn_UpsamplingBilinear2d) pnnx_onnx_add_test(nn_UpsamplingNearest2d) @@ -104,8 +126,30 @@ pnnx_onnx_add_test(squeezenet1_1) pnnx_onnx_add_test(swin_t) pnnx_onnx_add_test(vit_b_32) +pnnx_onnx_add_test(Tensor_expand) +pnnx_onnx_add_test(Tensor_permute) +pnnx_onnx_add_test(Tensor_repeat) +pnnx_onnx_add_test(Tensor_reshape) +pnnx_onnx_add_test(Tensor_select) +pnnx_onnx_add_test(Tensor_slice) +pnnx_onnx_add_test(Tensor_view) + +pnnx_onnx_add_test(torch_cat) +pnnx_onnx_add_test(torch_ceil) +pnnx_onnx_add_test(torch_chunk) +pnnx_onnx_add_test(torch_flatten) +pnnx_onnx_add_test(torch_floor) pnnx_onnx_add_test(torch_max) +pnnx_onnx_add_test(torch_maximum) pnnx_onnx_add_test(torch_mean) pnnx_onnx_add_test(torch_min) +pnnx_onnx_add_test(torch_minimum) pnnx_onnx_add_test(torch_prod) +pnnx_onnx_add_test(torch_roll) +pnnx_onnx_add_test(torch_split) +pnnx_onnx_add_test(torch_squeeze) +pnnx_onnx_add_test(torch_stack) pnnx_onnx_add_test(torch_sum) +pnnx_onnx_add_test(torch_transpose) +pnnx_onnx_add_test(torch_unbind) +pnnx_onnx_add_test(torch_unsqueeze) diff --git a/tools/pnnx/tests/onnx/test_F_log_softmax.py b/tools/pnnx/tests/onnx/test_F_log_softmax.py new file mode 100644 index 000000000000..8bc657c67780 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_log_softmax.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.log_softmax(x, 1) + y = F.log_softmax(y, 0) + z = F.log_softmax(z, 2) + w = F.log_softmax(w, 3) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_log_softmax.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_log_softmax.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_log_softmax_pnnx + b = test_F_log_softmax_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_logsigmoid.py b/tools/pnnx/tests/onnx/test_F_logsigmoid.py new file mode 100644 index 000000000000..a731936a1097 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_logsigmoid.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.logsigmoid(x) + y = F.logsigmoid(y) + z = F.logsigmoid(z) + w = F.logsigmoid(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_logsigmoid.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_logsigmoid.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_logsigmoid_pnnx + b = test_F_logsigmoid_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_mish.py b/tools/pnnx/tests/onnx/test_F_mish.py new file mode 100644 index 000000000000..69026d38b2bf --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_mish.py @@ -0,0 +1,76 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +def mish_forward_0(x): + return x * F.softplus(x).tanh() + +def mish_forward_1(x): + return x.mul(torch.tanh(F.softplus(x))) + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.mish(x) + y = F.mish(y) + z = mish_forward_0(z) + w = mish_forward_1(w) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.9'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_mish.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_mish.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_mish_pnnx + b = test_F_mish_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_selu.py b/tools/pnnx/tests/onnx/test_F_selu.py new file mode 100644 index 000000000000..e70f93441912 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_selu.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.selu(x) + y = F.selu(y) + z = F.selu(z) + w = F.selu(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_selu.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_selu.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_selu_pnnx + b = test_F_selu_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_sigmoid.py b/tools/pnnx/tests/onnx/test_F_sigmoid.py index 684a7ab48d9f..c90e570e0057 100644 --- a/tools/pnnx/tests/onnx/test_F_sigmoid.py +++ b/tools/pnnx/tests/onnx/test_F_sigmoid.py @@ -41,7 +41,7 @@ def test(): z = torch.rand(1, 3, 12, 16) w = torch.rand(1, 5, 7, 9, 11) - a0, a1, a2, a3 = net(x, y, z, w) + a = net(x, y, z, w) # export onnx torch.onnx.export(net, (x, y, z, w), "test_F_sigmoid.onnx") @@ -52,9 +52,12 @@ def test(): # pnnx inference import test_F_sigmoid_pnnx - b0, b1, b2, b3 = test_F_sigmoid_pnnx.test_inference() + b = test_F_sigmoid_pnnx.test_inference() - return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) and torch.equal(a3, b3) + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True if __name__ == "__main__": if test(): diff --git a/tools/pnnx/tests/onnx/test_F_silu.py b/tools/pnnx/tests/onnx/test_F_silu.py new file mode 100644 index 000000000000..d6cc987262ea --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_silu.py @@ -0,0 +1,69 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +def silu_forward_0(x): + return x * torch.sigmoid(x) + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.silu(x) + y = F.silu(y) + z = F.silu(z) + w = silu_forward_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_silu.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_silu.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_silu_pnnx + b = test_F_silu_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_softmin.py b/tools/pnnx/tests/onnx/test_F_softmin.py new file mode 100644 index 000000000000..88a82fea00af --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_softmin.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.softmin(x, 1) + y = F.softmin(y, 0) + z = F.softmin(z, 2) + w = F.softmin(w, 3) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_softmin.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_softmin.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_softmin_pnnx + b = test_F_softmin_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_softplus.py b/tools/pnnx/tests/onnx/test_F_softplus.py new file mode 100644 index 000000000000..c261f58d67c4 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_softplus.py @@ -0,0 +1,70 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.softplus(x) + y = F.softplus(y, 2, 5.2) + z = F.softplus(z, -0.7, 15) + w = F.softplus(w, 0.1, 0.3) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.11'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_softplus.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_softplus.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_softplus_pnnx + b = test_F_softplus_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_softshrink.py b/tools/pnnx/tests/onnx/test_F_softshrink.py new file mode 100644 index 000000000000..7f1fb8838077 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_softshrink.py @@ -0,0 +1,70 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.softshrink(x) + y = F.softshrink(y, 0.1) + z = F.softshrink(z, 0.22) + w = F.softshrink(w, 0) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.11'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_softshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_softshrink.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_softshrink_pnnx + b = test_F_softshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_softsign.py b/tools/pnnx/tests/onnx/test_F_softsign.py new file mode 100644 index 000000000000..27164f3dfc17 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_softsign.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.softsign(x) + y = F.softsign(y) + z = F.softsign(z) + w = F.softsign(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_softsign.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_softsign.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_softsign_pnnx + b = test_F_softsign_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_tanh.py b/tools/pnnx/tests/onnx/test_F_tanh.py new file mode 100644 index 000000000000..b56d513f655e --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_tanh.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.tanh(x) + y = F.tanh(y) + z = F.tanh(z) + w = F.tanh(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_tanh.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_tanh.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_tanh_pnnx + b = test_F_tanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_tanhshrink.py b/tools/pnnx/tests/onnx/test_F_tanhshrink.py new file mode 100644 index 000000000000..7be2bf57cb16 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_tanhshrink.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.tanhshrink(x) + y = F.tanhshrink(y) + z = F.tanhshrink(z) + w = F.tanhshrink(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_tanhshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_tanhshrink.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_tanhshrink_pnnx + b = test_F_tanhshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_expand.py b/tools/pnnx/tests/onnx/test_Tensor_expand.py new file mode 100644 index 000000000000..ceb01dac4c81 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_expand.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.expand(24) + y = y.expand(-1, 11, -1) + z = z.expand(2, 8, 3, -1, 4) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1) + y = torch.rand(3, 1, 1) + z = torch.rand(1, 8, 1, 9, 1) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_expand.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_expand.onnx inputshape=[1],[3,1,1],[1,8,1,9,1]") + + # pnnx inference + import test_Tensor_expand_pnnx + b = test_Tensor_expand_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_permute.py b/tools/pnnx/tests/onnx/test_Tensor_permute.py new file mode 100644 index 000000000000..a36de4c251cc --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_permute.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.permute(1, 0, 2) + x = x.permute(0, 1, 2) + y = y.permute(2, 3, 1, 0) + y = y.permute(3, 1, 0, 2) + z = z.permute(1, 3, 0, 4, 2) + z = z.permute(0, 2, 4, 3, 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_permute.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_permute.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_permute_pnnx + b = test_Tensor_permute_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_repeat.py b/tools/pnnx/tests/onnx/test_Tensor_repeat.py new file mode 100644 index 000000000000..569ad548beaf --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_repeat.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.repeat(1, 2, 3) + x = x.repeat(2, 3, 4) + y = y.repeat(1, 2, 1, 4) + y = y.repeat(3, 4, 5, 1) + z = z.repeat(1, 2, 3, 1, 5) + z = z.repeat(2, 3, 3, 1, 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_repeat.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_repeat.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_repeat_pnnx + b = test_Tensor_repeat_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_reshape.py b/tools/pnnx/tests/onnx/test_Tensor_reshape.py new file mode 100644 index 000000000000..027fb40a07d9 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_reshape.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.reshape(1, 2, 24) + x = x.reshape(48) + y = y.reshape(1, 11, 5, 9) + y = y.reshape(99, 5) + z = z.reshape(4, 3, 30, 10, 14) + z = z.reshape(15, 2, 10, 7, 8, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_reshape.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_reshape.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_reshape_pnnx + b = test_Tensor_reshape_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_select.py b/tools/pnnx/tests/onnx/test_Tensor_select.py new file mode 100644 index 000000000000..4f7488b55a52 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_select.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.select(1, 1) + y = y.select(2, 4) + z = z.select(0, 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_select.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_select.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_select_pnnx + b = test_Tensor_select_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_slice.py b/tools/pnnx/tests/onnx/test_Tensor_slice.py new file mode 100644 index 000000000000..7fe32b4af617 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_slice.py @@ -0,0 +1,79 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + if version.parse(torch.__version__) < version.parse('1.12'): + x = x[:,:12,1:14:1] + else: + x = x[:,:12,1:14:2] + x = x[...,1:] + if version.parse(torch.__version__) >= version.parse('1.10'): + x = x[:,:,:x.size(2)-1] + y = y[0:,1:,5:,3:] + if version.parse(torch.__version__) < version.parse('1.12'): + y = y[:,:,1:13:1,:14] + else: + y = y[:,:,1:13:2,:14] + if version.parse(torch.__version__) >= version.parse('1.10'): + y = y[:1,:y.size(1):,:,:] + z = z[4:] + if version.parse(torch.__version__) < version.parse('1.12'): + z = z[:2,:,:,:,2:-2:1] + else: + z = z[:2,:,:,:,2:-2:3] + if version.parse(torch.__version__) >= version.parse('1.10'): + z = z[:,:,:,z.size(3)-3:,:] + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 13, 26) + y = torch.rand(1, 15, 19, 21) + z = torch.rand(14, 18, 15, 19, 20) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_slice.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_slice.onnx inputshape=[1,13,26],[1,15,19,21],[14,18,15,19,20]") + + # pnnx inference + import test_Tensor_slice_pnnx + b = test_Tensor_slice_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_view.py b/tools/pnnx/tests/onnx/test_Tensor_view.py new file mode 100644 index 000000000000..40df090a07bb --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_view.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.view(1, 2, 24) + x = x.view(48) + y = y.view(1, 11, 5, 9) + y = y.view(99, 5) + z = z.view(4, 3, 30, 10, 14) + z = z.view(15, 2, 10, 7, 8, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_view.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_view.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_view_pnnx + b = test_Tensor_view_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py b/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py new file mode 100644 index 000000000000..ddb44cbf4427 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.LogSigmoid() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_LogSigmoid.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_LogSigmoid.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_LogSigmoid_pnnx + b = test_nn_LogSigmoid_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py b/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py new file mode 100644 index 000000000000..dbe8dc96d824 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py @@ -0,0 +1,71 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.LogSoftmax(dim=1) + self.act_1 = nn.LogSoftmax(dim=1) + self.act_2 = nn.LogSoftmax(dim=0) + self.act_3 = nn.LogSoftmax(dim=2) + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_1(y) + z = self.act_2(z) + w = self.act_3(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_LogSoftmax.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_LogSoftmax.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_LogSoftmax_pnnx + b = test_nn_LogSoftmax_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Mish.py b/tools/pnnx/tests/onnx/test_nn_Mish.py new file mode 100644 index 000000000000..481ba7181117 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Mish.py @@ -0,0 +1,72 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Mish() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.9'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Mish.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Mish.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Mish_pnnx + b = test_nn_Mish_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_ReLU.py b/tools/pnnx/tests/onnx/test_nn_ReLU.py index d381fb5bc0e5..8230e3f4827a 100644 --- a/tools/pnnx/tests/onnx/test_nn_ReLU.py +++ b/tools/pnnx/tests/onnx/test_nn_ReLU.py @@ -61,7 +61,7 @@ def test(): if not torch.allclose(a0, b0, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_nn_SELU.py b/tools/pnnx/tests/onnx/test_nn_SELU.py new file mode 100644 index 000000000000..a78c9e2336f3 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_SELU.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.SELU() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_SELU.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_SELU.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_SELU_pnnx + b = test_nn_SELU_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_SiLU.py b/tools/pnnx/tests/onnx/test_nn_SiLU.py new file mode 100644 index 000000000000..e509ddb6754f --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_SiLU.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.SiLU() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_SiLU.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_SiLU.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_SiLU_pnnx + b = test_nn_SiLU_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Sigmoid.py b/tools/pnnx/tests/onnx/test_nn_Sigmoid.py index 5b9cfc9a2bef..72d5d798ef48 100644 --- a/tools/pnnx/tests/onnx/test_nn_Sigmoid.py +++ b/tools/pnnx/tests/onnx/test_nn_Sigmoid.py @@ -43,7 +43,7 @@ def test(): z = torch.rand(1, 12, 24, 64) w = torch.rand(1, 12, 24, 32, 64) - a0, a1, a2, a3 = net(x, y, z, w) + a = net(x, y, z, w) # export onnx torch.onnx.export(net, (x, y, z, w), "test_nn_Sigmoid.onnx") @@ -54,9 +54,12 @@ def test(): # pnnx inference import test_nn_Sigmoid_pnnx - b0, b1, b2, b3 = test_nn_Sigmoid_pnnx.test_inference() + b = test_nn_Sigmoid_pnnx.test_inference() - return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) and torch.equal(a3, b3) + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True if __name__ == "__main__": if test(): diff --git a/tools/pnnx/tests/onnx/test_nn_Softmin.py b/tools/pnnx/tests/onnx/test_nn_Softmin.py new file mode 100644 index 000000000000..9cb8417f2f65 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Softmin.py @@ -0,0 +1,71 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softmin(dim=1) + self.act_1 = nn.Softmin(dim=1) + self.act_2 = nn.Softmin(dim=0) + self.act_3 = nn.Softmin(dim=2) + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_1(y) + z = self.act_2(z) + w = self.act_3(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Softmin.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Softmin.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Softmin_pnnx + b = test_nn_Softmin_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Softplus.py b/tools/pnnx/tests/onnx/test_nn_Softplus.py new file mode 100644 index 000000000000..445c6341b29c --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Softplus.py @@ -0,0 +1,73 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softplus() + self.act_1 = nn.Softplus(beta=0.7, threshold=15) + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_1(z) + w = self.act_1(w) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.11'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Softplus.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Softplus.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Softplus_pnnx + b = test_nn_Softplus_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Softshrink.py b/tools/pnnx/tests/onnx/test_nn_Softshrink.py new file mode 100644 index 000000000000..b86e9239c162 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Softshrink.py @@ -0,0 +1,73 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softshrink() + self.act_1 = nn.Softshrink(lambd=1.3) + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_1(z) + w = self.act_1(w) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.11'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Softshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Softshrink.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Softshrink_pnnx + b = test_nn_Softshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Softsign.py b/tools/pnnx/tests/onnx/test_nn_Softsign.py new file mode 100644 index 000000000000..da86752ca671 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Softsign.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softsign() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Softsign.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Softsign.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Softsign_pnnx + b = test_nn_Softsign_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Tanh.py b/tools/pnnx/tests/onnx/test_nn_Tanh.py new file mode 100644 index 000000000000..083275d277f2 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Tanh.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Tanh() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Tanh.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Tanh.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Tanh_pnnx + b = test_nn_Tanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py b/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py new file mode 100644 index 000000000000..20cabe2559a5 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Tanhshrink() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Tanhshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Tanhshrink.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Tanhshrink_pnnx + b = test_nn_Tanhshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_squeezenet1_1.py b/tools/pnnx/tests/onnx/test_squeezenet1_1.py index f5f5f4a668a9..28c7df8fb81e 100644 --- a/tools/pnnx/tests/onnx/test_squeezenet1_1.py +++ b/tools/pnnx/tests/onnx/test_squeezenet1_1.py @@ -39,7 +39,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_swin_t.py b/tools/pnnx/tests/onnx/test_swin_t.py index be25520d0bc4..6361d20c9116 100644 --- a/tools/pnnx/tests/onnx/test_swin_t.py +++ b/tools/pnnx/tests/onnx/test_swin_t.py @@ -43,7 +43,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_torch_cat.py b/tools/pnnx/tests/onnx/test_torch_cat.py new file mode 100644 index 000000000000..0d944434d280 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_cat.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + out0 = torch.cat((x, y), dim=1) + out1 = torch.cat((z, w), dim=3) + out2 = torch.cat((w, w), dim=2) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 2, 16) + z = torch.rand(1, 5, 9, 11) + w = torch.rand(1, 5, 9, 3) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_torch_cat.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_cat.onnx inputshape=[1,3,16],[1,2,16],[1,5,9,11],[1,5,9,3]") + + # pnnx inference + import test_torch_cat_pnnx + b = test_torch_cat_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_ceil.py b/tools/pnnx/tests/onnx/test_torch_ceil.py new file mode 100644 index 000000000000..1ff59b37a485 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_ceil.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.ceil(x * 10) + y = torch.ceil(y * 10) + z = torch.ceil(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_ceil.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_ceil.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_ceil_pnnx + b = test_torch_ceil_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_chunk.py b/tools/pnnx/tests/onnx/test_torch_chunk.py new file mode 100644 index 000000000000..2d1400103b9f --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_chunk.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1 = torch.chunk(x, chunks=2, dim=1) + y0, y1, y2 = torch.chunk(y, chunks=3, dim=2) + z0, z1, z2, z3, z4 = torch.chunk(z, chunks=5, dim=0) + return x0, x1, y0, y1, y2, z0, z1, z2, z3, z4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_chunk.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_chunk.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_chunk_pnnx + b = test_torch_chunk_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_flatten.py b/tools/pnnx/tests/onnx/test_torch_flatten.py new file mode 100644 index 000000000000..6105b106804e --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_flatten.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.flatten(x) + y = torch.flatten(y, start_dim=1, end_dim=-1) + z = torch.flatten(z, start_dim=3, end_dim=4) + x = x.relu() + y = y.relu() + z = z.relu() + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_flatten.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_flatten.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_flatten_pnnx + b = test_torch_flatten_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_floor.py b/tools/pnnx/tests/onnx/test_torch_floor.py new file mode 100644 index 000000000000..a046e4c241ac --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_floor.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.floor(x * 10) + y = torch.floor(y * 10) + z = torch.floor(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_floor.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_floor.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_floor_pnnx + b = test_torch_floor_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_maximum.py b/tools/pnnx/tests/onnx/test_torch_maximum.py new file mode 100644 index 000000000000..5e17d5cb2d2a --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_maximum.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.maximum(x, y) + out1 = torch.maximum(y, y) + out2 = torch.maximum(z, torch.ones_like(z) + 0.1) + return out0, out1, out2 + +def test(): + if version.parse(torch.__version__) < version.parse('1.12'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_maximum.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_maximum.onnx inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_maximum_pnnx + b = test_torch_maximum_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_minimum.py b/tools/pnnx/tests/onnx/test_torch_minimum.py new file mode 100644 index 000000000000..0d8e9a87e50c --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_minimum.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.minimum(x, y) + out1 = torch.minimum(y, y) + out2 = torch.minimum(z, torch.ones_like(z) + 0.1) + return out0, out1, out2 + +def test(): + if version.parse(torch.__version__) < version.parse('1.12'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_minimum.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_minimum.onnx inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_minimum_pnnx + b = test_torch_minimum_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_roll.py b/tools/pnnx/tests/onnx/test_torch_roll.py new file mode 100644 index 000000000000..06b8d579649e --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_roll.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3, -1) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('1.10'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_roll.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_roll.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_roll_pnnx + b = test_torch_roll_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_split.py b/tools/pnnx/tests/onnx/test_torch_split.py new file mode 100644 index 000000000000..b13b041cd96b --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_split.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1 = torch.split(x, split_size_or_sections=2, dim=1) + y0, y1, y2 = torch.split(y, split_size_or_sections=[1,3,5], dim=2) + z0, z1, z2, z3, z4 = torch.split(z, split_size_or_sections=3, dim=0) + return x0, x1, y0, y1, y2, z0, z1, z2, z3, z4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_split.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_split.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_split_pnnx + b = test_torch_split_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_squeeze.py b/tools/pnnx/tests/onnx/test_torch_squeeze.py new file mode 100644 index 000000000000..b29e4ba2f9d7 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_squeeze.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.squeeze(x, 1) + y = torch.squeeze(y) + z = torch.squeeze(z, 4) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 1, 16) + y = torch.rand(1, 5, 1, 11) + z = torch.rand(14, 8, 5, 9, 1) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_squeeze.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_squeeze.onnx inputshape=[1,1,16],[1,5,1,11],[14,8,5,9,1]") + + # pnnx inference + import test_torch_squeeze_pnnx + b = test_torch_squeeze_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_stack.py b/tools/pnnx/tests/onnx/test_torch_stack.py new file mode 100644 index 000000000000..7b04ddd307f5 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_stack.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + out0 = torch.stack((x, y), dim=0) + out1 = torch.stack((x, y), dim=2) + out2 = torch.stack((z, w), dim=2) + out3 = torch.stack((z, w), dim=-1) + return out0, out1, out2, out3 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + w = torch.rand(5, 9, 3) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_torch_stack.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_stack.onnx inputshape=[3,16],[3,16],[5,9,3],[5,9,3]") + + # pnnx inference + import test_torch_stack_pnnx + b = test_torch_stack_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_transpose.py b/tools/pnnx/tests/onnx/test_torch_transpose.py new file mode 100644 index 000000000000..e6a25c441017 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_transpose.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.transpose(x, 1, 2) + y = torch.transpose(y, 2, 3) + z = torch.transpose(z, 1, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_transpose.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_transpose.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_transpose_pnnx + b = test_torch_transpose_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_unbind.py b/tools/pnnx/tests/onnx/test_torch_unbind.py new file mode 100644 index 000000000000..a98fa25c51cc --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_unbind.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1, x2 = torch.unbind(x, dim=1) + y0, y1, y2, y3, y4, y5, y6, y7, y8 = torch.unbind(y, dim=2) + z0, z1, z2, z3 = torch.unbind(z, dim=0) + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(4, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_unbind.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_unbind.onnx inputshape=[1,3,16],[1,5,9,11],[4,8,5,9,10]") + + # pnnx inference + import test_torch_unbind_pnnx + b = test_torch_unbind_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_unsqueeze.py b/tools/pnnx/tests/onnx/test_torch_unsqueeze.py new file mode 100644 index 000000000000..01bf84076cf3 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_unsqueeze.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.unsqueeze(x, 0) + x = torch.unsqueeze(x, 1) + y = torch.unsqueeze(y, 2) + y = torch.unsqueeze(y, -1) + z = torch.unsqueeze(z, -2) + z = torch.unsqueeze(z, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_unsqueeze.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_unsqueeze.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_unsqueeze_pnnx + b = test_torch_unsqueeze_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_vit_b_32.py b/tools/pnnx/tests/onnx/test_vit_b_32.py index ecb0bd350f62..3c92a119406a 100644 --- a/tools/pnnx/tests/onnx/test_vit_b_32.py +++ b/tools/pnnx/tests/onnx/test_vit_b_32.py @@ -46,7 +46,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/test_F_rms_norm.py b/tools/pnnx/tests/test_F_rms_norm.py new file mode 100644 index 000000000000..5dd9e699b23f --- /dev/null +++ b/tools/pnnx/tests/test_F_rms_norm.py @@ -0,0 +1,77 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.w3 = nn.Parameter(torch.rand(24)) + self.w4 = nn.Parameter(torch.rand(12, 16)) + self.w5 = nn.Parameter(torch.rand(24)) + + def forward(self, x, y, z, w0, w1, w2): + x = F.rms_norm(x, (24,), w0) + x = F.rms_norm(x, (12,24), None) + x = F.rms_norm(x, (24,), self.w3) + + y = F.rms_norm(y, (16,), None, eps=1e-3) + y = F.rms_norm(y, (12,16), w1) + y = F.rms_norm(y, (12,16), self.w4) + + z = F.rms_norm(z, (24,), w2) + z = F.rms_norm(z, (12,16,24), None, eps=1e-2) + z = F.rms_norm(z, (24,), self.w5) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24) + y = torch.rand(2, 3, 12, 16) + z = torch.rand(1, 10, 12, 16, 24) + w0 = torch.rand(24) + w1 = torch.rand(12, 16) + w2 = torch.rand(24) + + a0, a1, a2 = net(x, y, z, w0, w1, w2) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z, w0, w1, w2)) + mod.save("test_F_rms_norm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_F_rms_norm.pt inputshape=[1,12,24],[2,3,12,16],[1,10,12,16,24],[24],[12,16],[24]") + + # pnnx inference + import test_F_rms_norm_pnnx + b0, b1, b2 = test_F_rms_norm_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_RMSNorm.py b/tools/pnnx/tests/test_nn_RMSNorm.py new file mode 100644 index 000000000000..a9b70cdb2661 --- /dev/null +++ b/tools/pnnx/tests/test_nn_RMSNorm.py @@ -0,0 +1,71 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.rmsn_0 = nn.RMSNorm(64) + self.rmsn_0.weight = nn.Parameter(torch.rand(64)) + self.rmsn_1 = nn.RMSNorm(normalized_shape=(24,64), eps=1e-2, elementwise_affine=False) + + def forward(self, x, y, z): + x = self.rmsn_0(x) + x = self.rmsn_1(x) + + y = self.rmsn_0(y) + y = self.rmsn_1(y) + + z = self.rmsn_0(z) + z = self.rmsn_1(z) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 24, 64) + y = torch.rand(1, 12, 24, 64) + z = torch.rand(1, 12, 16, 24, 64) + + a0, a1, a2 = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_RMSNorm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_nn_RMSNorm.pt inputshape=[1,24,64],[1,12,24,64],[1,12,16,24,64]") + + # pnnx inference + import test_nn_RMSNorm_pnnx + b0, b1, b2 = test_nn_RMSNorm_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py b/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py new file mode 100644 index 000000000000..b04fa93442fa --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py @@ -0,0 +1,77 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class T5LayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.rand(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + variance = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * x + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.rmsn_0 = T5LayerNorm(26) + self.rmsn_1 = T5LayerNorm(21) + + def forward(self, x, y): + x = self.rmsn_0(x) + y = self.rmsn_1(y) + return x, y + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 64, 26) + y = torch.rand(3, 15, 15, 21) + + a0, a1 = net(x, y) + + # export onnx + torch.onnx.export(net, (x,y), "test.onnx") + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_pnnx_fuse_rmsnorm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_fuse_rmsnorm.pt inputshape=[1,64,26],[3,15,15,21]") + + # pnnx inference + import test_pnnx_fuse_rmsnorm_pnnx + b0, b1 = test_pnnx_fuse_rmsnorm_pnnx.test_inference() + + return torch.allclose(a0, b0, 1e-4, 1e-4) and torch.allclose(a1, b1, 1e-4, 1e-4) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_logaddexp.py b/tools/pnnx/tests/test_torch_logaddexp.py new file mode 100644 index 000000000000..6914dbd62131 --- /dev/null +++ b/tools/pnnx/tests/test_torch_logaddexp.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.logaddexp(x, y) + out1 = torch.logaddexp(y, y) + out2 = torch.logaddexp(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_logaddexp.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_logaddexp.pt inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_logaddexp_pnnx + b = test_torch_logaddexp_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_roll.py b/tools/pnnx/tests/test_torch_roll.py new file mode 100644 index 000000000000..32e3bde38e13 --- /dev/null +++ b/tools/pnnx/tests/test_torch_roll.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_roll.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_roll.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_roll_pnnx + b = test_torch_roll_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_unbind.py b/tools/pnnx/tests/test_torch_unbind.py index c92c87b74351..b232f289dab4 100644 --- a/tools/pnnx/tests/test_torch_unbind.py +++ b/tools/pnnx/tests/test_torch_unbind.py @@ -24,7 +24,7 @@ def forward(self, x, y, z): x0, x1, x2 = torch.unbind(x, dim=1) y0, y1, y2, y3, y4, y5, y6, y7, y8 = torch.unbind(y, dim=2) z0, z1, z2, z3 = torch.unbind(z, dim=0) - return x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 def test(): net = Model() diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 4d19ceb6f166..5e92b333aa57 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -133,6 +133,8 @@ class NetQuantize : public ModelWriter int quantize_lstm(); int quantize_gru(); + int quantize_embed(); + int fuse_requantize(); }; @@ -562,6 +564,55 @@ int NetQuantize::quantize_gru() return 0; } +int NetQuantize::quantize_embed() +{ + for (size_t i = 0; i < layers.size(); i++) + { + if (layers[i]->type != "Embed") + continue; + + // Embed - quantize weight from fp32 to int8 + ncnn::Embed* embed = (ncnn::Embed*)layers[i]; + + fprintf(stderr, "quantize_embed %s\n", embed->name.c_str()); + + // TODO move to ncnn2table + + const int num_output = embed->num_output; + const int input_dim = embed->input_dim; + + ncnn::Mat weight_data_int8_scales(1); + { + const float* ptr = embed->weight_data; + float absmax = 0.f; + for (int i = 0; i < embed->weight_data.w; i++) + { + absmax = std::max(absmax, (float)fabs(ptr[i])); + } + + weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + ncnn::Mat weight_data_int8; + + ncnn::Option opt_q = opt; + opt_q.blob_allocator = embed->weight_data.allocator; + opt_q.use_packing_layout = false; + ncnn::quantize_to_int8(embed->weight_data, weight_data_int8, weight_data_int8_scales, opt_q); + if (weight_data_int8.empty()) + return -100; + + embed->weight_data = weight_data_int8; + } + + embed->int8_scale_term = 2; + embed->weight_data_int8_scale = weight_data_int8_scales[0]; + } + + return 0; +} + int NetQuantize::fuse_requantize() { const size_t layer_count = layers.size(); @@ -809,6 +860,7 @@ int main(int argc, char** argv) quantizer.quantize_rnn(); quantizer.quantize_lstm(); quantizer.quantize_gru(); + quantizer.quantize_embed(); quantizer.fuse_requantize();