diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml index 5c44354aaaa1..d49da39a0afc 100644 --- a/.ci/pnnx.yml +++ b/.ci/pnnx.yml @@ -4,12 +4,14 @@ on: branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' mr: target-branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' concurrency: @@ -17,10 +19,10 @@ concurrency: variables: protobuf_version: 21.12 - libtorch_version: 2.3.0 - libtorchvision_version: 0.18.0 - onnxruntime_version: 1.17.3 - cache_date: 20240504 + libtorch_version: 2.4.0 + libtorchvision_version: 0.19.0 + onnxruntime_version: 1.18.1 + cache_date: 20240804 jobs: ubuntu: @@ -57,6 +59,9 @@ jobs: - torch-version: 2.3.0 torchvision-version: 0.18.0 + - torch-version: 2.4.0 + torchvision-version: 0.19.0 + runs-on: pool-name: docker container: @@ -160,7 +165,7 @@ jobs: - name: setup-pytorch run: | export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} - pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu --index-url https://download.pytorch.org/whl/cpu pip3 install --user onnx pip3 install --user onnxscript diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index d8304c0e33c0..6b6db4f0d2e9 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -87,7 +87,7 @@ jobs: # build wheels for ubuntu-20.04 - name: Build wheels for ubuntu if: matrix.os == 'ubuntu-20.04' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -99,7 +99,7 @@ jobs: # build wheels for windows-2019 - name: Build wheels for windows if: matrix.os == 'windows-2019' && (matrix.arch == 'AMD64' || matrix.arch == 'x86') - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -112,7 +112,7 @@ jobs: - name: Build wheels for windows ARM64 if: matrix.os == 'windows-2019' && matrix.arch == 'ARM64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -184,41 +184,43 @@ jobs: - name: vulkansdk for macos if: matrix.os == 'macos-13' run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + wget https://sdk.lunarg.com/sdk/download/1.3.290.0/mac/vulkansdk-macos-1.3.290.0.dmg?Human=true -O vulkansdk-macos-1.3.290.0.dmg + hdiutil attach vulkansdk-macos-1.3.290.0.dmg + sudo /Volumes/vulkansdk-macos-1.3.290.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0 --accept-licenses --default-answer --confirm-command install - name: Build wheels for macos x86_64 if: matrix.os == 'macos-13' && matrix.arch == 'x86_64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: 
CMAKE_BUILD_PARALLEL_LEVEL=3 CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC ARCHS="x86_64" - DEPLOYMENT_TARGET="10.9" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF + DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" - Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib + Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0/macOS/lib/libMoltenVK.dylib + MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse - name: Build wheels for macos arm64 if: matrix.os == 'macos-13' && matrix.arch == 'arm64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3 CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC_ARM64 ARCHS="arm64" - DEPLOYMENT_TARGET="11.0" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF + DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" - Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib + Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0/macOS/lib/libMoltenVK.dylib + MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse @@ -244,7 +246,7 @@ jobs: fail-fast: false matrix: arch: [aarch64, ppc64le, s390x] - build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312] + build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312, cp313] build_sub: [manylinux, musllinux] steps: @@ -262,7 +264,7 @@ jobs: platforms: all - name: Build wheels for manylinux with qemu - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build_cp }}-${{ matrix.build_sub }}* @@ -310,7 +312,7 @@ jobs: platforms: all - name: Build wheels for manylinux with qemu - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build_pp }}-* diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6309214e08f6..2e875fc51e73 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -53,11 +53,20 @@ jobs: name: ${{ env.PACKAGENAME }} path: /tmp/${{ env.PACKAGENAME }}.zip - ubuntu-2004: + ubuntu: needs: [setup] - runs-on: ubuntu-20.04 + strategy: + matrix: + opt: + - { shared-lib: OFF, os: ubuntu-20.04, id: ubuntu-2004 } + - { shared-lib: OFF, os: ubuntu-22.04, id: ubuntu-2204 } + - { shared-lib: OFF, os: ubuntu-24.04, id: ubuntu-2404 } + - { shared-lib: ON, os: ubuntu-20.04, id: ubuntu-2004-shared } + - { shared-lib: ON, os: ubuntu-22.04, id: ubuntu-2204-shared } + - { shared-lib: ON, os: ubuntu-24.04, id: ubuntu-2404-shared } + runs-on: ${{ matrix.opt.os }} env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004 + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} steps: - uses: actions/checkout@v4 with: @@ -69,71 +78,7 @@ jobs: run: | mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Release 
-DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2004-shared: - needs: [setup] - runs-on: ubuntu-20.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2204: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204 - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: package @@ -149,38 +94,6 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - ubuntu-2204-shared: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - openmp-macos: runs-on: macos-13 env: @@ -255,85 +168,14 @@ jobs: macos: needs: [setup, openmp-macos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: macos } + - { vulkan: ON, id: macos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_TOOLS=OFF \ - -DNCNN_BUILD_EXAMPLES=OFF \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-macos - uses: actions/download-artifact@v4 - with: - name: openmp-macos - path: openmp-macos - - name: install-openmp - run: | - sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - macos-gpu: - needs: [setup, openmp-macos] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ @@ -346,10 +188,10 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_TOOLS=OFF \ -DNCNN_BUILD_EXAMPLES=OFF \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -389,6 +231,7 @@ jobs: cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -397,12 +240,26 @@ jobs: ln -s Versions/Current/Headers glslang.framework/Headers ln -s Versions/Current/Resources glslang.framework/Resources ln -s Versions/Current/glslang glslang.framework/glslang - libtool -static build-x86_64/install/lib/libglslang.a build-x86_64/install/lib/libMachineIndependent.a build-x86_64/install/lib/libGenericCodeGen.a build-x86_64/install/lib/libSPIRV.a build-x86_64/install/lib/libOGLCompiler.a build-x86_64/install/lib/libOSDependent.a -o build-x86_64/install/lib/libglslang_combined.a - libtool -static build-arm64/install/lib/libglslang.a build-arm64/install/lib/libMachineIndependent.a build-arm64/install/lib/libGenericCodeGen.a build-arm64/install/lib/libSPIRV.a build-arm64/install/lib/libOGLCompiler.a build-arm64/install/lib/libOSDependent.a -o 
build-arm64/install/lib/libglslang_combined.a + libtool -static \ + build-x86_64/install/lib/libglslang.a \ + build-x86_64/install/lib/libMachineIndependent.a \ + build-x86_64/install/lib/libGenericCodeGen.a \ + build-x86_64/install/lib/libSPIRV.a \ + build-x86_64/install/lib/libOGLCompiler.a \ + build-x86_64/install/lib/libOSDependent.a \ + -o build-x86_64/install/lib/libglslang_combined.a + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a lipo -create build-x86_64/install/lib/libglslang_combined.a build-arm64/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -412,8 +269,16 @@ jobs: ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -485,77 +350,14 @@ jobs: ios: needs: [setup, openmp-ios] + strategy: + matrix: + opt: + - { vulkan: OFF, id: ios } + - { vulkan: ON, id: ios-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-ios - uses: actions/download-artifact@v4 - with: - name: openmp-ios - path: openmp-ios - - name: install-openmp - run: | - sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" .. - cmake --build . 
-j 4 - cmake --build . --target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-gpu: - needs: [setup, openmp-ios] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ @@ -568,8 +370,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -603,6 +405,7 @@ jobs: cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -622,7 +425,7 @@ jobs: cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -632,8 +435,16 @@ jobs: ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist 
> ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -716,9 +527,14 @@ jobs: ios-simulator: needs: [setup, openmp-ios-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: ios-simulator } + - { vulkan: ON, id: ios-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ @@ -732,89 +548,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 - - name: download-openmp-ios-simulator - uses: actions/download-artifact@v4 - with: - name: openmp-ios-simulator - path: openmp-ios-simulator - - name: install-openmp - run: | - sudo cp openmp-ios-simulator/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include - sudo cp openmp-ios-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios-simulator/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-simulator-gpu: - needs: [setup, openmp-ios-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - with: - submodules: true + with: + submodules: true - name: download-openmp-ios-simulator uses: actions/download-artifact@v4 with: @@ -849,6 +588,7 @@ jobs: cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -879,7 +619,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -892,8 +632,16 @@ jobs: build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn - cp 
-a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -976,86 +724,14 @@ jobs: mac-catalyst: needs: [setup, openmp-mac-catalyst] + strategy: + matrix: + opt: + - { vulkan: OFF, id: mac-catalyst } + - { vulkan: ON, id: mac-catalyst-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-mac-catalyst - uses: actions/download-artifact@v4 - with: - name: openmp-mac-catalyst - path: openmp-mac-catalyst - - name: install-openmp - run: | - sudo cp openmp-mac-catalyst/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-mac-catalyst/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-mac-catalyst/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - mac-catalyst-gpu: - needs: [setup, openmp-mac-catalyst] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ @@ -1068,8 +744,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -1109,6 +785,7 @@ jobs: cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1139,7 +816,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1152,8 +829,16 @@ jobs: build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > 
ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -1534,86 +1219,14 @@ jobs: tvos: needs: [setup, openmp-tvos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: tvos } + - { vulkan: ON, id: tvos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-tvos - uses: actions/download-artifact@v4 - with: - name: openmp-tvos - path: openmp-tvos - - name: install-openmp - run: | - sudo cp openmp-tvos/include/* $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/include - sudo cp openmp-tvos/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/lib - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64e - run: | - mkdir build-arm64e && cd build-arm64e - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-tvos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-arm64/install/lib/libncnn.a \ - build-arm64e/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - tvos-gpu: - needs: [setup, openmp-tvos] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ @@ -1626,8 +1239,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -1667,6 +1280,7 @@ jobs: cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1697,7 +1311,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1712,6 +1326,14 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework 
ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -1794,9 +1416,14 @@ jobs: tvos-simulator: needs: [setup, openmp-tvos-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: tvos-simulator } + - { vulkan: ON, id: tvos-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ @@ -1810,9 +1437,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-tvos-simulator uses: actions/download-artifact@v4 with: @@ -1846,87 +1476,8 @@ jobs: cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - tvos-simulator-gpu: - needs: [setup, openmp-tvos-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: download-openmp-tvos-simulator - uses: actions/download-artifact@v4 - with: - name: openmp-tvos-simulator - path: openmp-tvos-simulator - - name: install-openmp - run: | - sudo cp openmp-tvos-simulator/include/* $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/include - sudo cp openmp-tvos-simulator/lib/libomp.a 
$DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package-glslang + - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1957,7 +1508,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1972,6 +1523,14 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -2043,9 +1602,14 @@ jobs: visionos: needs: [setup, openmp-visionos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: visionos } + - { vulkan: ON, id: visionos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-visionos + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ @@ -2059,9 +1623,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-visionos uses: actions/download-artifact@v4 with: @@ -2089,7 +1656,28 @@ jobs: cp openmp-visionos/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-visionos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > 
openmp.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-glslang + if: matrix.opt.vulkan == 'ON' + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a + cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang + cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -2101,8 +1689,16 @@ jobs: cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v4 with: @@ -2183,9 +1779,14 @@ jobs: visionos-simulator: needs: [setup, openmp-visionos-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: visionos-simulator } + - { vulkan: ON, id: visionos-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ @@ -2199,9 +1800,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-visionos-simulator uses: actions/download-artifact@v4 with: @@ -2235,7 +1839,39 @@ jobs: cp openmp-visionos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-visionos-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-glslang + if: matrix.opt.vulkan == 'ON' + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources 
glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static \ + build-x86_64/install/lib/libglslang.a \ + build-x86_64/install/lib/libMachineIndependent.a \ + build-x86_64/install/lib/libGenericCodeGen.a \ + build-x86_64/install/lib/libSPIRV.a \ + build-x86_64/install/lib/libOGLCompiler.a \ + build-x86_64/install/lib/libOSDependent.a \ + -o build-x86_64/install/lib/libglslang_combined.a + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a + lipo -create \ + build-x86_64/install/lib/libglslang_combined.a \ + build-arm64/install/lib/libglslang_combined.a \ + -o glslang.framework/Versions/A/glslang + cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -2250,8 +1886,16 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v4 with: @@ -2260,51 +1904,63 @@ jobs: android: needs: [setup] + strategy: + matrix: + opt: + - { vulkan: OFF, shared-lib: OFF, id: android } + - { vulkan: OFF, shared-lib: ON, id: android-shared } + - { vulkan: ON, shared-lib: OFF, id: android-vulkan } + - { vulkan: ON, shared-lib: ON, id: android-vulkan-shared } runs-on: ubuntu-latest env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} + NCNN_CMAKE_OPTIONS: | + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake \ + -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=install \ + -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ + -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} \ + -DNCNN_AVX512BF16=OFF \ + steps: - uses: actions/checkout@v4 + with: + submodules: true - name: ndk-fix-debug run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 + - name: build-armeabi-v7a run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. 
+ mkdir build-armeabi-v7a && cd build-armeabi-v7a + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-aarch64 + - name: build-arm64-v8a run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + mkdir build-arm64-v8a && cd build-arm64-v8a + cmake ${{ env.NCNN_CMAKE_OPTIONS }}-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: build-x86 run: | mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + cmake ${{ env.NCNN_CMAKE_OPTIONS }}-DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: build-x86_64 run: | mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + cmake ${{ env.NCNN_CMAKE_OPTIONS }}-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - name: package run: | rm -rf ${{ env.PACKAGENAME }} mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a + cp -a build-armeabi-v7a/install ${{ env.PACKAGENAME }}/armeabi-v7a + cp -a build-arm64-v8a/install ${{ env.PACKAGENAME }}/arm64-v8a cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 rm -f ${{ env.PACKAGENAME }}.zip @@ -2315,55 +1971,63 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - android-shared: + webassembly: needs: [setup] runs-on: ubuntu-latest env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-shared + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly steps: - uses: actions/checkout@v4 - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 + - name: emsdk + run: | + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + ./emsdk install $EMSCRIPTEN_VERSION + ./emsdk activate $EMSCRIPTEN_VERSION + - name: build run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-aarch64 + - name: build-simd run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-simd && cd build-simd + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - - name: build-x86 + - name: build-threads run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-threads && cd build-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-x86_64 + - name: build-simd-threads run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-simd-threads && cd build-simd-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - name: package run: | rm -rf ${{ env.PACKAGENAME }} mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 + cp -a build/install ${{ env.PACKAGENAME }}/basic + cp -a build-simd/install ${{ env.PACKAGENAME }}/simd + cp -a build-threads/install ${{ env.PACKAGENAME }}/threads + cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - name: upload-zip @@ -2372,692 +2036,96 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - android-gpu: + windows: needs: [setup] - runs-on: ubuntu-latest + strategy: + matrix: + opt: + - { shared-lib: OFF, os: windows-2019, toolset-version: v140, id: vs2015 } + - { shared-lib: OFF, os: windows-2019, toolset-version: v141, id: vs2017 } + - { shared-lib: OFF, os: windows-2019, toolset-version: v142, id: vs2019 } + - { shared-lib: OFF, os: windows-2022, toolset-version: v143, id: vs2022 } + - { shared-lib: ON, os: windows-2019, toolset-version: v140, id: vs2015-shared } + - { shared-lib: ON, os: windows-2019, toolset-version: v141, id: vs2017-shared } + - { shared-lib: ON, os: windows-2019, toolset-version: v142, id: vs2019-shared } + - { shared-lib: ON, os: windows-2022, toolset-version: v143, id: vs2022-shared } + runs-on: ${{ matrix.opt.os }} env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-${{ matrix.opt.id }} + UseMultiToolTask: true + NCNN_CMAKE_OPTIONS: | + -T ${{ matrix.opt.toolset-version }},host=x64 ` + -DCMAKE_BUILD_TYPE=Release ` + -DCMAKE_INSTALL_PREFIX=install ` + -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" ` + -DNCNN_BUILD_EXAMPLES=OFF ` + -DNCNN_BUILD_TOOLS=ON ` + -DNCNN_BUILD_BENCHMARK=OFF ` + -DNCNN_VULKAN=ON ` + -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} ` + steps: - uses: actions/checkout@v4 with: submodules: true - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-aarch64 + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v4 + with: + path: "protobuf-install" + key: protobuf-${{ matrix.opt.toolset-version }}-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + cd .. + mkdir build-x64; cd build-x64; + cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install - name: build-x86 run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86_64 + mkdir build-x86; cd build-x86 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A Win32 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-x64 run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - android-gpu-shared: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: build-aarch64 - run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86 - run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - webassembly: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly - steps: - - uses: actions/checkout@v4 - - name: emsdk - run: | - git clone https://github.com/emscripten-core/emsdk.git - cd emsdk - ./emsdk install $EMSCRIPTEN_VERSION - ./emsdk activate $EMSCRIPTEN_VERSION - - name: build - run: | - source emsdk/emsdk_env.sh - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: build-simd - run: | - source emsdk/emsdk_env.sh - mkdir build-simd && cd build-simd - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-threads && cd build-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-simd-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-simd-threads && cd build-simd-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install ${{ env.PACKAGENAME }}/basic - cp -a build-simd/install ${{ env.PACKAGENAME }}/simd - cp -a build-threads/install ${{ env.PACKAGENAME }}/threads - cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install + mkdir build-x64; cd build-x64 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-arm + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' + run: | + mkdir build-arm; cd build-arm + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-arm64 + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' + run: | + mkdir build-arm64; cd build-arm64 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm64 .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install - name: package + if: matrix.opt.toolset-version == 'v140' || matrix.opt.toolset-version == 'v141' run: | mkdir ${{ env.PACKAGENAME }} mkdir ${{ env.PACKAGENAME }}/x86 mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - name: package + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' run: | mkdir ${{ env.PACKAGENAME }} mkdir ${{ env.PACKAGENAME }}/x86 @@ -3087,30 +2155,49 @@ jobs: with: path: artifacts - - name: create-xcframwork + - name: unzip run: | - mkdir -p ncnn-macos mkdir -p ncnn-ios + mkdir -p ncnn-ios-vulkan mkdir -p ncnn-ios-simulator + mkdir -p ncnn-ios-simulator-vulkan mkdir -p ncnn-mac-catalyst - mkdir -p ncnn-watchos - mkdir -p ncnn-watchos-simulator + mkdir -p ncnn-mac-catalyst-vulkan + mkdir -p ncnn-macos + mkdir -p ncnn-macos-vulkan mkdir -p ncnn-tvos + mkdir -p ncnn-tvos-vulkan mkdir -p ncnn-tvos-simulator + mkdir -p ncnn-tvos-simulator-vulkan mkdir -p ncnn-visionos + mkdir -p ncnn-visionos-vulkan mkdir -p ncnn-visionos-simulator + mkdir -p ncnn-visionos-simulator-vulkan + mkdir -p ncnn-watchos + mkdir -p ncnn-watchos-simulator - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos/ncnn-${{ needs.setup.outputs.VERSION }}-macos.zip -d ncnn-macos unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios/ncnn-${{ needs.setup.outputs.VERSION }}-ios.zip -d ncnn-ios + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan.zip -d ncnn-ios-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator.zip -d ncnn-ios-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan.zip -d ncnn-ios-simulator-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst.zip -d ncnn-mac-catalyst - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan.zip -d ncnn-mac-catalyst-vulkan + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos/ncnn-${{ needs.setup.outputs.VERSION }}-macos.zip -d ncnn-macos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan.zip -d ncnn-macos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos/ncnn-${{ needs.setup.outputs.VERSION }}-tvos.zip -d ncnn-tvos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan.zip -d ncnn-tvos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator.zip -d ncnn-tvos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan.zip -d ncnn-tvos-simulator-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos/ncnn-${{ needs.setup.outputs.VERSION }}-visionos.zip -d ncnn-visionos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan.zip -d ncnn-visionos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator/ncnn-${{ 
needs.setup.outputs.VERSION }}-visionos-simulator.zip -d ncnn-visionos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan.zip -d ncnn-visionos-simulator-vulkan + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator + - name: create-xcframwork + run: | + rm -rf openmp.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos/openmp.framework \ -framework ncnn-ios/openmp.framework \ @@ -3124,6 +2211,7 @@ jobs: -framework ncnn-visionos-simulator/openmp.framework \ -output openmp.xcframework + rm -rf ncnn.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos/ncnn.framework \ -framework ncnn-ios/ncnn.framework \ @@ -3139,48 +2227,9 @@ jobs: rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework ncnn.xcframework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - apple-gpu: - needs: [setup, macos-gpu, ios-gpu, ios-simulator-gpu, mac-catalyst-gpu, watchos, watchos-simulator, tvos-gpu, tvos-simulator-gpu, visionos, visionos-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-apple-vulkan - steps: - - run: sudo xcode-select --switch /Applications/Xcode_15.2.app - - name: download - uses: actions/download-artifact@v4 - with: - path: artifacts - - - name: create-xcframwork + - name: create-xcframwork-vulkan run: | - mkdir -p ncnn-macos-vulkan - mkdir -p ncnn-ios-vulkan - mkdir -p ncnn-ios-simulator-vulkan - mkdir -p ncnn-mac-catalyst-vulkan - mkdir -p ncnn-watchos - mkdir -p ncnn-watchos-simulator - mkdir -p ncnn-tvos-vulkan - mkdir -p ncnn-tvos-simulator-vulkan - mkdir -p ncnn-visionos - mkdir -p ncnn-visionos-simulator - - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan.zip -d ncnn-macos-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan.zip -d ncnn-ios-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan.zip -d ncnn-ios-simulator-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan.zip -d ncnn-mac-catalyst-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan.zip -d ncnn-tvos-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan.zip -d ncnn-tvos-simulator-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos/ncnn-${{ needs.setup.outputs.VERSION }}-visionos.zip -d ncnn-visionos - unzip -q artifacts/ncnn-${{ 
needs.setup.outputs.VERSION }}-visionos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator.zip -d ncnn-visionos-simulator - + rm -rf openmp.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/openmp.framework \ -framework ncnn-ios-vulkan/openmp.framework \ @@ -3194,6 +2243,7 @@ jobs: -framework ncnn-visionos-simulator/openmp.framework \ -output openmp.xcframework + rm -rf glslang.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/glslang.framework \ -framework ncnn-ios-vulkan/glslang.framework \ @@ -3201,8 +2251,11 @@ jobs: -framework ncnn-mac-catalyst-vulkan/glslang.framework \ -framework ncnn-tvos-vulkan/glslang.framework \ -framework ncnn-tvos-simulator-vulkan/glslang.framework \ + -framework ncnn-visionos-vulkan/glslang.framework \ + -framework ncnn-visionos-simulator-vulkan/glslang.framework \ -output glslang.xcframework + rm -rf ncnn.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/ncnn.framework \ -framework ncnn-ios-vulkan/ncnn.framework \ @@ -3212,22 +2265,27 @@ jobs: -framework ncnn-watchos-simulator/ncnn.framework \ -framework ncnn-tvos-vulkan/ncnn.framework \ -framework ncnn-tvos-simulator-vulkan/ncnn.framework \ - -framework ncnn-visionos/ncnn.framework \ - -framework ncnn-visionos-simulator/ncnn.framework \ + -framework ncnn-visionos-vulkan/ncnn.framework \ + -framework ncnn-visionos-simulator-vulkan/ncnn.framework \ -output ncnn.xcframework - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework glslang.xcframework ncnn.xcframework + rm -f ${{ env.PACKAGENAME }}-vulkan.zip + zip -9 -y -r ${{ env.PACKAGENAME }}-vulkan.zip openmp.xcframework glslang.xcframework ncnn.xcframework - name: upload-zip uses: actions/upload-artifact@v4 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip + - name: upload-zip-vulkan + uses: actions/upload-artifact@v4 + with: + name: ${{ env.PACKAGENAME }}-vulkan + path: ${{ env.PACKAGENAME }}-vulkan.zip release: permissions: contents: write # for softprops/action-gh-release to create a release - needs: [setup, full-source, ubuntu-2004, ubuntu-2004-shared, ubuntu-2204, ubuntu-2204-shared, macos, macos-gpu, ios, ios-gpu, ios-simulator, ios-simulator-gpu, mac-catalyst, mac-catalyst-gpu, watchos, watchos-simulator, tvos, tvos-simulator, android, android-shared, android-gpu, android-gpu-shared, webassembly, windows-vs2015, windows-vs2015-shared, windows-vs2017, windows-vs2017-shared, windows-vs2019, windows-vs2019-shared, windows-vs2022, windows-vs2022-shared, apple, apple-gpu] + needs: [setup, full-source, ubuntu, macos, ios, ios-simulator, mac-catalyst, watchos, watchos-simulator, tvos, tvos-simulator, android, webassembly, windows, apple] runs-on: ubuntu-latest steps: - name: download diff --git a/CMakeLists.txt b/CMakeLists.txt index 309e3b8fbd0a..0f32a80c86ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,11 @@ if(POLICY CMP0025) cmake_policy(SET CMP0025 NEW) endif() +if(POLICY CMP0057) + # reference from https://cmake.org/cmake/help/latest/policy/CMP0057.html + cmake_policy(SET CMP0057 NEW) +endif() + project(ncnn) if(MSVC AND NOT CMAKE_VERSION VERSION_LESS "3.15") diff --git a/README.md b/README.md index a9bb1c116fa4..146b04b1a4ed 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ https://github.com/Tencent/ncnn/releases/latest Source - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-full-source.zip) + 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-full-source.zip) @@ -97,8 +97,8 @@ https://github.com/Tencent/ncnn/releases/latest Android - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android.zip) @@ -111,8 +111,8 @@ https://github.com/Tencent/ncnn/releases/latest Android shared - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-vulkan-shared.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-vulkan-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-shared.zip) @@ -159,8 +159,8 @@ https://github.com/Tencent/ncnn/releases/latest iOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios.zip) @@ -173,8 +173,8 @@ https://github.com/Tencent/ncnn/releases/latest iOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-simulator-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-simulator.zip) @@ -193,8 +193,8 @@ https://github.com/Tencent/ncnn/releases/latest macOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-macos-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-macos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-macos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-macos.zip) @@ -207,8 +207,8 @@ https://github.com/Tencent/ncnn/releases/latest Mac-Catalyst - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-mac-catalyst-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-mac-catalyst.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-mac-catalyst-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-mac-catalyst.zip) @@ -221,7 +221,7 @@ https://github.com/Tencent/ncnn/releases/latest watchOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-watchos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-watchos.zip) @@ -234,7 +234,7 @@ https://github.com/Tencent/ncnn/releases/latest watchOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-watchos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-watchos-simulator.zip) @@ -242,8 +242,8 @@ https://github.com/Tencent/ncnn/releases/latest tvOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-vulkan.zip) - 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos.zip) @@ -256,8 +256,8 @@ https://github.com/Tencent/ncnn/releases/latest tvOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-simulator-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-simulator.zip) @@ -265,7 +265,8 @@ https://github.com/Tencent/ncnn/releases/latest visionOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-visionos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos.zip) @@ -278,7 +279,8 @@ https://github.com/Tencent/ncnn/releases/latest visionOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-visionos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-simulator.zip) @@ -286,8 +288,8 @@ https://github.com/Tencent/ncnn/releases/latest Apple xcframework - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-apple-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-apple.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-apple-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-apple.zip) @@ -296,10 +298,10 @@ https://github.com/Tencent/ncnn/releases/latest - + - + - [Build for Linux / NVIDIA Jetson / Raspberry Pi3, Pi4 / POWER](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux) @@ -309,11 +311,11 @@ https://github.com/Tencent/ncnn/releases/latest Ubuntu 20.04 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2004.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2004-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2004.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2004-shared.zip) - + [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-x64-gpu-gcc) @@ -323,8 +325,17 @@ https://github.com/Tencent/ncnn/releases/latest Ubuntu 22.04 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2204.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2204-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2204.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2204-shared.zip) + + + + +Ubuntu 24.04 + + + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2404.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2404-shared.zip) @@ -344,8 +355,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2015 - 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2015.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2015-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2015.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2015-shared.zip) @@ -358,8 +369,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2017 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2017.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2017-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2017.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2017-shared.zip) @@ -367,8 +378,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2019 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2019.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2019-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2019.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2019-shared.zip) @@ -376,8 +387,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2022 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2022.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2022-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2022.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2022-shared.zip) @@ -396,7 +407,7 @@ https://github.com/Tencent/ncnn/releases/latest WebAssembly - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-webassembly.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-webassembly.zip) @@ -560,7 +571,7 @@ https://github.com/Tencent/ncnn/releases/latest **[use netron for ncnn model visualization](https://netron.app)** -**[out-of-the-box web model conversion](https://convertmodel.com/#outputFormat=ncnn)** +**[use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx)** [ncnn low-level operation api](https://github.com/Tencent/ncnn/wiki/low-level-operation-api) diff --git a/benchmark/README.md b/benchmark/README.md index 1927acf81cd4..df9e55de4a8e 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -5911,6 +5911,298 @@ cooling_down = 0 FastestDet min = 5.13 max = 5.47 avg = 5.30 ``` +### HUAWEI Kunpeng 920 7260 (x64 cores) +test on Ubuntu 20.04 (gcc 9.4.0) +``` +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 1 0 -1 0 +loop_count = 300 +num_threads = 1 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 11.64 max = 12.11 avg = 11.71 + squeezenet_int8 min = 12.22 max = 13.22 avg = 12.37 + mobilenet min = 20.00 max = 20.79 avg = 20.08 + mobilenet_int8 min = 17.44 max = 19.09 avg = 17.64 + mobilenet_v2 min = 13.29 max = 14.25 avg = 13.39 + mobilenet_v3 min = 11.06 max = 11.84 avg = 11.11 + shufflenet min = 7.56 max = 7.74 avg = 7.59 + shufflenet_v2 min = 7.84 max = 8.37 avg = 7.88 + mnasnet min = 13.07 max = 13.78 avg = 13.14 + proxylessnasnet min = 15.71 max = 16.31 avg = 15.77 + efficientnet_b0 min = 34.79 max = 35.98 avg = 34.92 + 
efficientnetv2_b0 min = 35.28 max = 36.36 avg = 35.41 + regnety_400m min = 17.06 max = 17.74 avg = 17.16 + blazeface min = 2.99 max = 3.04 avg = 3.01 + googlenet min = 50.76 max = 51.74 avg = 51.00 + googlenet_int8 min = 50.31 max = 52.27 avg = 50.65 + resnet18 min = 34.97 max = 37.17 avg = 35.82 + resnet18_int8 min = 40.47 max = 42.03 avg = 40.78 + alexnet min = 39.19 max = 39.80 avg = 39.32 + vgg16 min = 176.62 max = 181.29 avg = 177.07 + vgg16_int8 min = 352.35 max = 358.38 avg = 355.15 + resnet50 min = 96.76 max = 98.63 avg = 97.09 + resnet50_int8 min = 90.00 max = 92.74 avg = 90.81 + squeezenet_ssd min = 33.23 max = 33.99 avg = 33.39 + squeezenet_ssd_int8 min = 38.50 max = 41.53 avg = 39.28 + mobilenet_ssd min = 42.49 max = 44.78 avg = 42.72 + mobilenet_ssd_int8 min = 37.06 max = 39.97 avg = 37.57 + mobilenet_yolo min = 96.34 max = 98.91 avg = 96.73 + mobilenetv2_yolov3 min = 50.88 max = 52.97 avg = 51.15 + yolov4-tiny min = 65.56 max = 67.13 avg = 65.80 + nanodet_m min = 19.94 max = 20.82 avg = 20.04 + yolo-fastest-1.1 min = 7.66 max = 7.81 avg = 7.71 + yolo-fastestv2 min = 6.82 max = 7.23 avg = 6.87 + vision_transformer min = 1535.03 max = 1552.84 avg = 1543.73 + FastestDet min = 7.17 max = 7.50 avg = 7.21 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 2 0 -1 0 +loop_count = 300 +num_threads = 2 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 6.35 max = 9.15 avg = 7.33 + squeezenet_int8 min = 8.06 max = 8.60 avg = 8.14 + mobilenet min = 10.30 max = 11.86 avg = 11.48 + mobilenet_int8 min = 8.93 max = 11.87 avg = 10.47 + mobilenet_v2 min = 9.05 max = 11.50 avg = 9.19 + mobilenet_v3 min = 6.32 max = 6.42 avg = 6.36 + shufflenet min = 6.73 max = 8.55 avg = 6.81 + shufflenet_v2 min = 4.94 max = 6.65 avg = 6.32 + mnasnet min = 7.38 max = 10.77 avg = 8.82 + proxylessnasnet min = 8.57 max = 9.72 avg = 8.63 + efficientnet_b0 min = 18.61 max = 22.53 avg = 20.42 + efficientnetv2_b0 min = 18.75 max = 21.93 avg = 20.79 + regnety_400m min = 11.86 max = 15.09 avg = 14.60 + blazeface min = 1.95 max = 3.37 avg = 2.06 + googlenet min = 28.66 max = 32.24 avg = 28.94 + googlenet_int8 min = 27.64 max = 32.15 avg = 30.84 + resnet18 min = 20.33 max = 20.77 avg = 20.47 + resnet18_int8 min = 22.63 max = 23.72 avg = 22.88 + alexnet min = 20.41 max = 29.37 avg = 27.22 + vgg16 min = 101.72 max = 140.33 avg = 103.29 + vgg16_int8 min = 187.56 max = 211.44 avg = 189.92 + resnet50 min = 51.07 max = 59.25 avg = 58.35 + resnet50_int8 min = 46.50 max = 52.55 avg = 48.93 + squeezenet_ssd min = 22.48 max = 28.59 avg = 22.98 + squeezenet_ssd_int8 min = 25.56 max = 26.82 avg = 25.99 + mobilenet_ssd min = 22.81 max = 26.21 avg = 24.88 + mobilenet_ssd_int8 min = 19.31 max = 25.53 avg = 21.74 + mobilenet_yolo min = 59.58 max = 62.04 avg = 59.99 + mobilenetv2_yolov3 min = 33.26 max = 35.74 avg = 33.51 + yolov4-tiny min = 41.14 max = 45.34 avg = 42.46 + nanodet_m min = 12.10 max = 16.69 avg = 15.02 + yolo-fastest-1.1 min = 5.44 max = 7.78 avg = 7.24 + yolo-fastestv2 min = 5.03 max = 8.08 avg = 6.75 + vision_transformer min = 994.46 max = 1090.68 avg = 1045.50 + FastestDet min = 6.76 max = 6.91 avg = 6.83 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 4 0 -1 0 +loop_count = 300 +num_threads = 4 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 3.79 max = 6.99 avg = 4.55 + squeezenet_int8 min = 5.13 max = 5.68 avg = 5.20 + mobilenet min = 6.25 max = 6.55 avg = 6.30 + mobilenet_int8 min = 5.96 max = 6.10 avg = 
6.03 + mobilenet_v2 min = 5.34 max = 7.15 avg = 5.62 + mobilenet_v3 min = 4.05 max = 5.74 avg = 5.01 + shufflenet min = 3.69 max = 5.81 avg = 5.15 + shufflenet_v2 min = 4.31 max = 6.02 avg = 4.56 + mnasnet min = 4.48 max = 6.05 avg = 5.54 + proxylessnasnet min = 5.05 max = 8.08 avg = 6.03 + efficientnet_b0 min = 10.17 max = 12.21 avg = 11.58 + efficientnetv2_b0 min = 10.86 max = 15.78 avg = 12.70 + regnety_400m min = 9.24 max = 14.13 avg = 11.98 + blazeface min = 1.89 max = 1.97 avg = 1.93 + googlenet min = 15.19 max = 20.31 avg = 16.90 + googlenet_int8 min = 17.97 max = 19.40 avg = 18.11 + resnet18 min = 11.18 max = 11.48 avg = 11.29 + resnet18_int8 min = 12.26 max = 12.78 avg = 12.44 + alexnet min = 14.43 max = 16.94 avg = 14.68 + vgg16 min = 62.40 max = 78.42 avg = 64.96 + vgg16_int8 min = 101.52 max = 109.42 avg = 104.46 + resnet50 min = 29.19 max = 39.69 avg = 32.99 + resnet50_int8 min = 26.94 max = 28.82 avg = 27.16 + squeezenet_ssd min = 12.90 max = 16.52 avg = 15.20 + squeezenet_ssd_int8 min = 15.58 max = 18.40 avg = 16.28 + mobilenet_ssd min = 13.68 max = 14.45 avg = 13.87 + mobilenet_ssd_int8 min = 12.20 max = 14.58 avg = 12.84 + mobilenet_yolo min = 34.85 max = 36.54 avg = 35.05 + mobilenetv2_yolov3 min = 18.61 max = 20.93 avg = 19.92 + yolov4-tiny min = 26.09 max = 32.32 avg = 28.03 + nanodet_m min = 7.85 max = 12.48 avg = 11.00 + yolo-fastest-1.1 min = 6.19 max = 6.49 avg = 6.31 + yolo-fastestv2 min = 3.66 max = 6.83 avg = 5.11 + vision_transformer min = 605.95 max = 624.99 avg = 609.79 + FastestDet min = 4.32 max = 5.41 avg = 5.17 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 8 0 -1 0 +loop_count = 300 +num_threads = 8 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.72 max = 3.74 avg = 3.05 + squeezenet_int8 min = 3.80 max = 4.71 avg = 4.03 + mobilenet min = 3.94 max = 5.15 avg = 4.00 + mobilenet_int8 min = 3.73 max = 3.87 avg = 3.80 + mobilenet_v2 min = 4.51 max = 6.57 avg = 4.68 + mobilenet_v3 min = 4.12 max = 4.38 avg = 4.28 + shufflenet min = 4.60 max = 6.27 avg = 4.88 + shufflenet_v2 min = 4.07 max = 4.20 avg = 4.11 + mnasnet min = 4.26 max = 4.51 avg = 4.36 + proxylessnasnet min = 4.71 max = 7.40 avg = 4.80 + efficientnet_b0 min = 8.49 max = 8.74 avg = 8.56 + efficientnetv2_b0 min = 9.34 max = 9.68 avg = 9.41 + regnety_400m min = 8.00 max = 12.85 avg = 10.64 + blazeface min = 1.76 max = 1.84 avg = 1.80 + googlenet min = 10.89 max = 11.33 avg = 10.98 + googlenet_int8 min = 11.66 max = 14.07 avg = 11.83 + resnet18 min = 6.48 max = 6.61 avg = 6.54 + resnet18_int8 min = 7.30 max = 7.79 avg = 7.51 + alexnet min = 8.33 max = 8.95 avg = 8.62 + vgg16 min = 29.94 max = 47.54 avg = 31.95 + vgg16_int8 min = 54.67 max = 60.76 avg = 56.03 + resnet50 min = 16.13 max = 20.79 avg = 20.03 + resnet50_int8 min = 15.64 max = 20.13 avg = 16.11 + squeezenet_ssd min = 11.58 max = 12.02 avg = 11.77 + squeezenet_ssd_int8 min = 11.14 max = 13.72 avg = 12.10 + mobilenet_ssd min = 8.27 max = 10.77 avg = 8.76 + mobilenet_ssd_int8 min = 8.13 max = 9.09 avg = 8.29 + mobilenet_yolo min = 23.90 max = 24.69 avg = 24.17 + mobilenetv2_yolov3 min = 14.83 max = 15.72 avg = 15.19 + yolov4-tiny min = 19.78 max = 23.66 avg = 20.05 + nanodet_m min = 8.92 max = 10.76 avg = 9.09 + yolo-fastest-1.1 min = 5.49 max = 5.77 avg = 5.63 + yolo-fastestv2 min = 5.04 max = 5.21 avg = 5.10 + vision_transformer min = 318.42 max = 379.40 avg = 363.66 + FastestDet min = 4.18 max = 4.54 avg = 4.38 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# 
../build/benchmark/benchncnn 300 16 0 -1 0 +loop_count = 300 +num_threads = 16 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.70 max = 3.14 avg = 2.81 + squeezenet_int8 min = 3.21 max = 4.22 avg = 3.39 + mobilenet min = 3.13 max = 3.26 avg = 3.20 + mobilenet_int8 min = 3.17 max = 5.05 avg = 3.30 + mobilenet_v2 min = 4.31 max = 6.24 avg = 4.62 + mobilenet_v3 min = 3.57 max = 3.77 avg = 3.68 + shufflenet min = 4.70 max = 6.45 avg = 4.80 + shufflenet_v2 min = 3.73 max = 4.27 avg = 3.87 + mnasnet min = 3.67 max = 3.87 avg = 3.75 + proxylessnasnet min = 4.28 max = 4.81 avg = 4.35 + efficientnet_b0 min = 7.31 max = 7.77 avg = 7.53 + efficientnetv2_b0 min = 9.87 max = 12.33 avg = 10.07 + regnety_400m min = 17.95 max = 18.53 avg = 18.26 + blazeface min = 2.26 max = 2.40 avg = 2.33 + googlenet min = 9.51 max = 9.99 avg = 9.68 + googlenet_int8 min = 10.98 max = 11.36 avg = 11.18 + resnet18 min = 5.59 max = 6.08 avg = 5.71 + resnet18_int8 min = 6.55 max = 7.28 avg = 6.77 + alexnet min = 6.26 max = 6.50 avg = 6.36 + vgg16 min = 23.98 max = 27.37 avg = 24.89 + vgg16_int8 min = 38.07 max = 39.66 avg = 39.02 + resnet50 min = 12.81 max = 14.19 avg = 13.76 + resnet50_int8 min = 12.42 max = 12.84 avg = 12.55 + squeezenet_ssd min = 10.80 max = 11.49 avg = 11.12 + squeezenet_ssd_int8 min = 11.57 max = 12.21 avg = 11.74 + mobilenet_ssd min = 7.46 max = 8.08 avg = 7.84 + mobilenet_ssd_int8 min = 7.47 max = 8.07 avg = 7.63 + mobilenet_yolo min = 21.70 max = 23.43 avg = 21.92 + mobilenetv2_yolov3 min = 12.55 max = 14.56 avg = 12.90 + yolov4-tiny min = 17.68 max = 19.85 avg = 18.18 + nanodet_m min = 8.35 max = 8.70 avg = 8.45 + yolo-fastest-1.1 min = 5.70 max = 7.11 avg = 6.05 + yolo-fastestv2 min = 4.85 max = 5.70 avg = 5.37 + vision_transformer min = 214.36 max = 259.56 avg = 245.47 + FastestDet min = 5.01 max = 5.42 avg = 5.17 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 32 0 -1 0 +loop_count = 300 +num_threads = 32 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.30 max = 2.94 avg = 2.46 + squeezenet_int8 min = 3.08 max = 4.88 avg = 4.03 + mobilenet min = 2.49 max = 2.76 avg = 2.53 + mobilenet_int8 min = 2.86 max = 3.73 avg = 2.95 + mobilenet_v2 min = 4.51 max = 5.20 avg = 4.74 + mobilenet_v3 min = 5.11 max = 6.91 avg = 6.10 + shufflenet min = 5.57 max = 6.51 avg = 5.78 + shufflenet_v2 min = 4.37 max = 4.66 avg = 4.48 + mnasnet min = 3.72 max = 4.08 avg = 3.90 + proxylessnasnet min = 4.19 max = 6.18 avg = 4.79 + efficientnet_b0 min = 6.80 max = 7.22 avg = 6.89 + efficientnetv2_b0 min = 13.98 max = 17.55 avg = 15.06 + regnety_400m min = 16.10 max = 16.72 avg = 16.26 + blazeface min = 2.12 max = 2.53 avg = 2.17 + googlenet min = 8.63 max = 9.89 avg = 8.77 + googlenet_int8 min = 9.90 max = 11.09 avg = 10.08 + resnet18 min = 6.54 max = 6.99 avg = 6.73 + resnet18_int8 min = 8.34 max = 9.00 avg = 8.67 + alexnet min = 6.64 max = 7.15 avg = 6.93 + vgg16 min = 22.79 max = 23.91 avg = 23.50 + vgg16_int8 min = 32.37 max = 37.51 avg = 33.13 + resnet50 min = 11.19 max = 16.40 avg = 11.47 + resnet50_int8 min = 11.92 max = 12.55 avg = 12.13 + squeezenet_ssd min = 10.75 max = 12.28 avg = 11.12 + squeezenet_ssd_int8 min = 11.31 max = 12.29 avg = 11.57 + mobilenet_ssd min = 10.25 max = 11.26 avg = 10.79 + mobilenet_ssd_int8 min = 11.39 max = 16.99 avg = 11.98 + mobilenet_yolo min = 52.11 max = 60.46 avg = 53.84 + mobilenetv2_yolov3 min = 12.07 max = 12.47 avg = 12.20 + yolov4-tiny min = 17.48 max = 17.79 avg = 17.58 + nanodet_m min = 13.06 
max = 14.71 avg = 13.64 + yolo-fastest-1.1 min = 5.70 max = 5.89 avg = 5.79 + yolo-fastestv2 min = 8.89 max = 9.99 avg = 9.21 + vision_transformer min = 158.92 max = 187.40 avg = 168.21 + FastestDet min = 8.70 max = 9.43 avg = 9.00 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 64 0 -1 0 +loop_count = 300 +num_threads = 64 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 6.85 max = 78.56 avg = 7.81 + squeezenet_int8 min = 8.06 max = 88.91 avg = 9.23 + mobilenet min = 3.02 max = 86.86 avg = 5.89 + mobilenet_int8 min = 3.58 max = 4.55 avg = 3.68 + mobilenet_v2 min = 5.05 max = 150.06 avg = 13.04 + mobilenet_v3 min = 4.85 max = 125.22 avg = 8.34 + shufflenet min = 17.80 max = 220.55 avg = 21.01 + shufflenet_v2 min = 11.23 max = 381.95 avg = 13.71 + mnasnet min = 9.83 max = 128.42 avg = 11.10 + proxylessnasnet min = 10.53 max = 68.52 avg = 12.03 + efficientnet_b0 min = 16.78 max = 968.87 avg = 23.94 + efficientnetv2_b0 min = 26.23 max = 551.18 avg = 31.34 + regnety_400m min = 70.14 max = 407.92 avg = 78.30 + blazeface min = 7.27 max = 191.44 avg = 9.37 + googlenet min = 16.69 max = 820.58 avg = 25.06 + googlenet_int8 min = 20.58 max = 849.09 avg = 29.87 + resnet18 min = 8.67 max = 349.00 avg = 11.33 + resnet18_int8 min = 10.40 max = 128.98 avg = 11.45 + alexnet min = 6.15 max = 196.01 avg = 10.24 + vgg16 min = 21.11 max = 288.66 avg = 29.37 + vgg16_int8 min = 30.72 max = 251.95 avg = 37.68 + resnet50 min = 19.10 max = 114.08 avg = 22.00 + resnet50_int8 min = 18.99 max = 436.89 avg = 24.36 + squeezenet_ssd min = 22.22 max = 510.52 avg = 28.76 + squeezenet_ssd_int8 min = 23.42 max = 614.70 avg = 30.82 + mobilenet_ssd min = 7.62 max = 202.66 avg = 14.59 + mobilenet_ssd_int8 min = 7.89 max = 109.82 avg = 8.80 + mobilenet_yolo min = 31.43 max = 742.10 avg = 45.52 + mobilenetv2_yolov3 min = 18.31 max = 273.05 avg = 20.78 + yolov4-tiny min = 21.03 max = 400.05 avg = 33.64 + nanodet_m min = 19.94 max = 114.18 avg = 21.89 + yolo-fastest-1.1 min = 7.20 max = 174.60 avg = 9.13 + yolo-fastestv2 min = 7.50 max = 170.55 avg = 9.01 + vision_transformer min = 126.90 max = 335.71 avg = 157.38 + FastestDet min = 6.59 max = 19.77 avg = 6.77 +``` + ### Intel Atom x5-Z8350 ``` nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 20 4 0 -1 1 diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 05996f8d7354..de4d6b428e99 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -71,6 +71,7 @@ * [Reorg](#reorg) * [Requantize](#requantize) * [Reshape](#reshape) +* [RMSNorm](#rmsnorm) * [RNN](#rnn) * [Scale](#scale) * [SELU](#selu) @@ -836,11 +837,13 @@ y = embedding(x) | 1 | input_dim | int | 0 | | | 2 | bias_term | int | 0 | | | 3 | weight_data_size | int | 0 | | +| 18 | int8_scale_term| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float | [weight_data_size] | | bias_term | float | [num_output] | +| weight_data_int8_scales| float | [1] | # Exp ``` @@ -1670,6 +1673,26 @@ Reshape flag: - -1 = remaining - -233 = drop this dim(default) +# RMSNorm +``` +split x along outmost axis into part x0, x1 ... +root mean square normalize for each part x0, x1 ... 
+y = x * gamma elementwise
+```
+
+* one_blob_only
+* support_inplace
+
+| param id | name | type | default | description |
+| --------- | ------------- | ----- | --------- | ----------------- |
+| 0 | affine_size | int | 0 | |
+| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) |
+| 2 | affine | int | 1 | |
+
+| weight | type | shape |
+| ------------- | ----- | --------------------- |
+| gamma_data | float | [affine_size] |
+
 # RNN
 Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`.
diff --git a/docs/faq.en.md b/docs/faq.en.md
index 807c4a9e3ee6..44d0068263b6 100644
--- a/docs/faq.en.md
+++ b/docs/faq.en.md
@@ -262,7 +262,7 @@ Fully customizable op, first change to one that can export (e.g. concat slice),
 Set net.opt.use_vulkan_compute = true before load_param / load_model;
 
-- ## How to ececute multiple blob inputs, multiple blob outputs?
+- ## How to execute multiple blob inputs, multiple blob outputs?
 Multiple execute `ex.input()` and `ex.extract()` like following
 ```
 ex.input("data1", in_1);
diff --git a/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md
index 9b0559a8eb8f..e0195aa1403c 100644
--- a/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md
+++ b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md
@@ -2,8 +2,114 @@ Here is a practical guide for converting pytorch model to ncnn
 resnet18 is used as the example
-## pytorch to onnx
-
+## pytorch to ncnn, onnx to ncnn
+
+### What is pnnx?
+PyTorch Neural Network eXchange (PNNX) is an open standard for PyTorch model interoperability. PNNX provides an open model format for PyTorch. It defines a computation graph as well as high-level operators that strictly match PyTorch.
+It is now recommended to use the `pnnx` tool to convert your `onnx` or `pytorch` model into an ncnn model.
+
+### How to install pnnx?
+* A. python pip (recommended)
+  * Windows/Linux/macOS 64bit
+  * python 3.7 or later
+
+  ```shell
+  pip3 install pnnx
+  ```
+
+* B. portable binary package (recommended if you hate python)
+  * Windows/Linux/macOS 64bit
+  * For Linux, glibc 2.17+
+
+  Download the portable pnnx binary package from https://github.com/pnnx/pnnx/releases and extract it.
+
+* C. build from source
+  1. install pytorch
+  2. (optional) install torchvision for pnnx torchvision operator support
+  3. (optional) install protobuf for pnnx onnx-zero support
+  4. clone https://github.com/Tencent/ncnn.git
+  5. build pnnx in ncnn/tools/pnnx with cmake
+
+  You can refer to https://github.com/pnnx/pnnx/blob/main/.github/workflows/release.yml for detailed steps
+
+  ```shell
+  git clone https://github.com/Tencent/ncnn.git
+  mkdir ncnn/tools/pnnx/build
+  cd ncnn/tools/pnnx/build
+  cmake -DCMAKE_INSTALL_PREFIX=install -DTorch_INSTALL_DIR= -DTorchVision_INSTALL_DIR= ..
+  cmake --build . --config Release -j 4
+  cmake --build . --config Release --target install
+  ```
+
+### How to use pnnx?
+* A. python
+  1. optimize and export your torch model with pnnx.export()
+  ```python
+  import torch
+  import torchvision.models as models
+  import pnnx
+
+  model = models.resnet18(pretrained=True)
+
+  x = torch.rand(1, 3, 224, 224)
+
+  opt_model = pnnx.export(model, "resnet18.pt", x)
+
+  # use a tuple for models with multiple inputs
+  # opt_model = pnnx.export(model, "resnet18.pt", (x, y, z))
+  ```
+  2. use the optimized module just like the normal one
+  ```python
+  result = opt_model(x)
+  ```
+  3.
pick resnet18_pnnx.py for the pnnx-optimized torch model
+  4. pick resnet18.ncnn.param and resnet18.ncnn.bin for ncnn inference (see the minimal loading sketch below)
+
+* B. command line
+  1. export your torch model to torchscript / onnx
+  ```python
+  import torch
+  import torchvision.models as models
+
+  net = models.resnet18(pretrained=True)
+  net = net.eval()
+
+  x = torch.rand(1, 3, 224, 224)
+
+  # You could try disabling checking when tracing raises an error
+  # mod = torch.jit.trace(net, x, check_trace=False)
+  mod = torch.jit.trace(net, x)
+
+  mod.save("resnet18.pt")
+
+  # You could also try exporting to the good old onnx
+  torch.onnx.export(net, x, 'resnet18.onnx')
+  ```
+
+  2. use pnnx to convert the torchscript / onnx file to optimized pnnx model and ncnn model files
+  ```shell
+  ./pnnx resnet18.pt inputshape=[1,3,224,224]
+  ./pnnx resnet18.onnx inputshape=[1,3,224,224]
+  ```
+  macOS zsh users may need double quotes to prevent ambiguity
+  ```shell
+  ./pnnx resnet18.pt "inputshape=[1,3,224,224]"
+  ```
+  For models with multiple inputs, use a list
+  ```shell
+  ./pnnx resnet18.pt inputshape=[1,3,224,224],[1,32]
+  ```
+  For models with non-fp32 input data types, add a type suffix
+  ```shell
+  ./pnnx resnet18.pt inputshape=[1,3,224,224]f32,[1,32]i64
+  ```
+  3. pick resnet18_pnnx.py for the pnnx-optimized torch model
+  4. pick resnet18.ncnn.param and resnet18.ncnn.bin for ncnn inference
+
+see more pnnx information: https://github.com/pnnx/pnnx
+
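+### How to run the converted ncnn model?
+The exported resnet18.ncnn.param / resnet18.ncnn.bin pair is loaded with the regular ncnn runtime. The snippet below is only a minimal illustrative sketch using the `ncnn` python wheel; it assumes the pnnx-generated blob names `in0` and `out0` (check your resnet18.ncnn.param if they differ) and it skips real image preprocessing.
+
+```python
+import ncnn
+
+net = ncnn.Net()
+net.load_param("resnet18.ncnn.param")
+net.load_model("resnet18.ncnn.bin")
+
+# 224x224 3-channel input blob; fill it with a normalized RGB image in practice
+in0 = ncnn.Mat((224, 224, 3))
+
+ex = net.create_extractor()
+ex.input("in0", in0)
+ret, out0 = ex.extract("out0")
+print(ret, out0.w)  # resnet18 produces 1000 class scores
+```
+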
+## pytorch to onnx (deprecated)
+
 The official pytorch tutorial for exporting onnx model
 
 https://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html
@@ -22,9 +128,10 @@ x = torch.rand(1, 3, 224, 224)
 
 # Export the model
 torch_out = torch.onnx._export(model, x, "resnet18.onnx", export_params=True)
 ```
+
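+A quick optional sanity check of the exported file before converting it. This sketch is not part of the original guide and assumes the `onnx` pip package is installed:
+
+```python
+import onnx
+
+model = onnx.load("resnet18.onnx")
+onnx.checker.check_model(model)  # raises if the exported graph is structurally invalid
+print(set(node.op_type for node in model.graph.node))  # peek at the exported operators
+```
+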
-## simplify onnx model - +## simplify onnx model (deprecated) +
simplify onnx model The exported resnet18.onnx model may contains many redundant operators such as Shape, Gather and Unsqueeze that is not supported in ncnn ``` @@ -37,19 +144,36 @@ Unsqueeze not supported yet! # axes 7 ``` -Fortunately, daquexian developed a handy tool to eliminate them. cheers! +### onnxsim -https://github.com/daquexian/onnx-simplifier +Fortunately, [@daquexian](https://github.com/daquexian) developed a handy tool to eliminate them. cheers! +#### how to use onnxsim? +```shell +pip install onnxsim +python -m onnxsim resnet18.onnx resnet18-sim.onnx ``` -python3 -m onnxsim resnet18.onnx resnet18-sim.onnx -``` +more informations: https://github.com/daquexian/onnx-simplifier -## onnx to ncnn +### onnxslim -Finally, you can convert the model to ncnn using tools/onnx2ncnn +Or you can use another powerful model simplification tool implemented in pure Python development by [@inisis](https://github.com/inisis): +#### how to use onnxslim? +```shell +pip install onnxslim +python -m onnxslim resnet18.onnx resnet18-slim.onnx ``` -onnx2ncnn resnet18-sim.onnx resnet18.param resnet18.bin -``` +more informations: https://github.com/inisis/OnnxSlim +
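+After simplification you can optionally check that the simplified model still produces the same outputs as the original one. This is only an illustrative sketch and assumes the `onnxruntime` pip package; it is not required by the conversion flow:
+
+```python
+import numpy as np
+import onnxruntime as ort
+
+x = np.random.rand(1, 3, 224, 224).astype(np.float32)
+
+ref = ort.InferenceSession("resnet18.onnx")
+sim = ort.InferenceSession("resnet18-sim.onnx")
+
+y_ref = ref.run(None, {ref.get_inputs()[0].name: x})[0]
+y_sim = sim.run(None, {sim.get_inputs()[0].name: x})[0]
+print(np.abs(y_ref - y_sim).max())  # expect a value close to 0
+```
+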
+
+## onnx2ncnn (deprecated)
+
+~~The onnx2ncnn tool is no longer maintained. It is recommended to use the PNNX tool instead.~~
+
+~~Finally, you can convert the model to ncnn using tools/onnx2ncnn~~
+
+~~onnx2ncnn resnet18-sim.onnx resnet18.param resnet18.bin~~
+
\ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a7739be27e51..bf3017dbe680 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -69,6 +69,7 @@ if(NCNN_PIXEL) ncnn_add_example(yolov4) ncnn_add_example(rvm) ncnn_add_example(p2pnet) + ncnn_add_example(yolov8) endif() else() message(WARNING "OpenCV not found and NCNN_SIMPLEOCV disabled, examples won't be built") diff --git a/examples/yolov8.cpp b/examples/yolov8.cpp new file mode 100644 index 000000000000..e166e6c1d174 --- /dev/null +++ b/examples/yolov8.cpp @@ -0,0 +1,410 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Copyright (C) 2024 whyb(https://github.com/whyb). All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +// ReadMe +// Convert yolov8 model to ncnn model workflow: +// +// step 1: +// If you don't want to train the model yourself. You should go to the ultralytics website download the pretrained model file. +// original pretrained model from https://docs.ultralytics.com/models/yolov8/#supported-tasks-and-modes +// +// step 2: +// run this command. +// conda create --name yolov8 python=3.11 +// conda activate yolov8 +// pip install ultralytics onnx numpy protobuf +// +// step 3: +// save source code file(export_model_to_ncnn.py): +// from ultralytics import YOLO +// detection_models = [ +// ["./Detection-pt/yolov8n.pt", "./Detection-pt/"], +// ["./Detection-pt/yolov8s.pt", "./Detection-pt/"], +// ["./Detection-pt/yolov8m.pt", "./Detection-pt/"], +// ["./Detection-pt/yolov8l.pt", "./Detection-pt/"], +// ["./Detection-pt/yolov8x.pt", "./Detection-pt/"] +// ] +// for model_dict in detection_models: +// model = YOLO(model_dict[0]) # load an official pretrained weight model +// model.export(format="ncnn", dynamic=True, save_dir=model_dict[1], simplify=True) +// +// step 4: +// run command: python export_model_to_ncnn.py + +#include +#include +#include +#include "layer.h" +#include "net.h" + +#include +#include +#include +#include +#include + +#define MAX_STRIDE 32 + +struct Object +{ + cv::Rect_ rect; + int label; + float prob; +}; + +static inline float intersection_area(const Object& a, const Object& b) +{ + cv::Rect_ inter = a.rect & b.rect; + return inter.area(); +} + +static void qsort_descent_inplace(std::vector& objects, int left, int right) +{ + int i = left; + int j = right; + float p = objects[(left + right) / 2].prob; + + while (i <= j) + { + while (objects[i].prob > p) + i++; + + while (objects[j].prob < p) + j--; + + if (i <= j) + { + // swap + std::swap(objects[i], objects[j]); + + i++; + j--; + } + } + + #pragma omp parallel sections + { + #pragma omp section + { + if (left < j) qsort_descent_inplace(objects, left, j); + } + #pragma omp section + { + if (i < right) qsort_descent_inplace(objects, i, right); + } + } +} + +static void qsort_descent_inplace(std::vector& objects) +{ + if (objects.empty()) + return; + 
+ qsort_descent_inplace(objects, 0, objects.size() - 1); +} + +static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold, bool agnostic = false) +{ + picked.clear(); + + const int n = faceobjects.size(); + + std::vector areas(n); + for (int i = 0; i < n; i++) + { + areas[i] = faceobjects[i].rect.area(); + } + + for (int i = 0; i < n; i++) + { + const Object& a = faceobjects[i]; + + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) + { + const Object& b = faceobjects[picked[j]]; + + if (!agnostic && a.label != b.label) + continue; + + // intersection over union + float inter_area = intersection_area(a, b); + float union_area = areas[i] + areas[picked[j]] - inter_area; + // float IoU = inter_area / union_area + if (inter_area / union_area > nms_threshold) + keep = 0; + } + + if (keep) + picked.push_back(i); + } +} + +static inline float sigmoid(float x) +{ + return static_cast(1.f / (1.f + exp(-x))); +} + +static inline float clampf(float d, float min, float max) +{ + const float t = d < min ? min : d; + return t > max ? max : t; +} + +static void parse_yolov8_detections( + float* inputs, float confidence_threshold, + int num_channels, int num_anchors, int num_labels, + int infer_img_width, int infer_img_height, + std::vector& objects) +{ + std::vector detections; + cv::Mat output = cv::Mat((int)num_channels, (int)num_anchors, CV_32F, inputs).t(); + + for (int i = 0; i < num_anchors; i++) + { + const float* row_ptr = output.row(i).ptr(); + const float* bboxes_ptr = row_ptr; + const float* scores_ptr = row_ptr + 4; + const float* max_s_ptr = std::max_element(scores_ptr, scores_ptr + num_labels); + float score = *max_s_ptr; + if (score > confidence_threshold) + { + float x = *bboxes_ptr++; + float y = *bboxes_ptr++; + float w = *bboxes_ptr++; + float h = *bboxes_ptr; + + float x0 = clampf((x - 0.5f * w), 0.f, (float)infer_img_width); + float y0 = clampf((y - 0.5f * h), 0.f, (float)infer_img_height); + float x1 = clampf((x + 0.5f * w), 0.f, (float)infer_img_width); + float y1 = clampf((y + 0.5f * h), 0.f, (float)infer_img_height); + + cv::Rect_ bbox; + bbox.x = x0; + bbox.y = y0; + bbox.width = x1 - x0; + bbox.height = y1 - y0; + Object object; + object.label = max_s_ptr - scores_ptr; + object.prob = score; + object.rect = bbox; + detections.push_back(object); + } + } + objects = detections; +} + +static int detect_yolov8(const cv::Mat& bgr, std::vector& objects) +{ + ncnn::Net yolov8; + + yolov8.opt.use_vulkan_compute = true; // if you want detect in hardware, then enable it + + yolov8.load_param("yolov8n.param"); + yolov8.load_model("yolov8n.bin"); + + const int target_size = 640; + const float prob_threshold = 0.25f; + const float nms_threshold = 0.45f; + + int img_w = bgr.cols; + int img_h = bgr.rows; + + // letterbox pad to multiple of MAX_STRIDE + int w = img_w; + int h = img_h; + float scale = 1.f; + if (w > h) + { + scale = (float)target_size / w; + w = target_size; + h = h * scale; + } + else + { + scale = (float)target_size / h; + h = target_size; + w = w * scale; + } + + ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); + + int wpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w; + int hpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h; + ncnn::Mat in_pad; + ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); + + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; + 
in_pad.substract_mean_normalize(0, norm_vals); + + ncnn::Extractor ex = yolov8.create_extractor(); + + ex.input("in0", in_pad); + + std::vector proposals; + + // stride 32 + { + ncnn::Mat out; + ex.extract("out0", out); + + std::vector objects32; + const int num_labels = 80; // COCO has detect 80 object labels. + parse_yolov8_detections( + (float*)out.data, prob_threshold, + out.h, out.w, num_labels, + in_pad.w, in_pad.h, + objects32); + proposals.insert(proposals.end(), objects32.begin(), objects32.end()); + } + + // sort all proposals by score from highest to lowest + qsort_descent_inplace(proposals); + + // apply nms with nms_threshold + std::vector picked; + nms_sorted_bboxes(proposals, picked, nms_threshold); + + int count = picked.size(); + + objects.resize(count); + for (int i = 0; i < count; i++) + { + objects[i] = proposals[picked[i]]; + + // adjust offset to original unpadded + float x0 = (objects[i].rect.x - (wpad / 2)) / scale; + float y0 = (objects[i].rect.y - (hpad / 2)) / scale; + float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale; + float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale; + + // clip + x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); + y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); + x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); + y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); + + objects[i].rect.x = x0; + objects[i].rect.y = y0; + objects[i].rect.width = x1 - x0; + objects[i].rect.height = y1 - y0; + } + + return 0; +} + +static void draw_objects(const cv::Mat& bgr, const std::vector& objects) +{ + static const char* class_names[] = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush" + }; + + static const unsigned char colors[19][3] = { + {54, 67, 244}, + {99, 30, 233}, + {176, 39, 156}, + {183, 58, 103}, + {181, 81, 63}, + {243, 150, 33}, + {244, 169, 3}, + {212, 188, 0}, + {136, 150, 0}, + {80, 175, 76}, + {74, 195, 139}, + {57, 220, 205}, + {59, 235, 255}, + {7, 193, 255}, + {0, 152, 255}, + {34, 87, 255}, + {72, 85, 121}, + {158, 158, 158}, + {139, 125, 96} + }; + + int color_index = 0; + + cv::Mat image = bgr.clone(); + + for (size_t i = 0; i < objects.size(); i++) + { + const Object& obj = objects[i]; + + const unsigned char* color = colors[color_index % 19]; + color_index++; + + cv::Scalar cc(color[0], color[1], color[2]); + + fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, + obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); + + cv::rectangle(image, obj.rect, cc, 2); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); + + int baseLine = 0; + cv::Size label_size = cv::getTextSize(text, 
cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + int x = obj.rect.x; + int y = obj.rect.y - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), + cc, -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255)); + } + + cv::imshow("image", image); + cv::waitKey(0); +} + +int main(int argc, char** argv) +{ + if (argc != 2) + { + fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); + return -1; + } + + const char* imagepath = argv[1]; + + cv::Mat m = cv::imread(imagepath, 1); + if (m.empty()) + { + fprintf(stderr, "cv::imread %s failed\n", imagepath); + return -1; + } + + std::vector objects; + detect_yolov8(m, objects); + + draw_objects(m, objects); + + return 0; +} diff --git a/python/src/main.cpp b/python/src/main.cpp index a7ed0528c6ab..e5b1264264c9 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -34,6 +34,20 @@ using namespace ncnn; namespace py = pybind11; +class DataReaderFromMemoryCopy : public DataReaderFromMemory +{ +public: + explicit DataReaderFromMemoryCopy(const unsigned char*& mem) + : DataReaderFromMemory(mem) + { + } + + virtual size_t reference(size_t size, const void** buf) const + { + return 0; + } +}; + struct LayerFactory { std::string name; @@ -956,6 +970,13 @@ PYBIND11_MODULE(ncnn, m) #endif // NCNN_STRING .def("load_param_bin", (int (Net::*)(const char*)) & Net::load_param_bin, py::arg("protopath")) .def("load_model", (int (Net::*)(const char*)) & Net::load_model, py::arg("modelpath")) + .def( + "load_model_mem", [](Net& net, const char* mem) { + const unsigned char* _mem = (const unsigned char*)mem; + DataReaderFromMemoryCopy dr(_mem); + net.load_model(dr); + }, + py::arg("mem")) #endif // NCNN_STDIO .def("clear", &Net::clear) diff --git a/python/tests/test_net.py b/python/tests/test_net.py index 03271aff4623..362cc4791fb8 100644 --- a/python/tests/test_net.py +++ b/python/tests/test_net.py @@ -42,6 +42,32 @@ def test_net(): assert len(net.blobs()) == 0 and len(net.layers()) == 0 +def test_net_mem(): + modelbin = bytearray(303940) + modelbin[0:4] = 71,107,48,1 + modelbin[180:184] = 71,107,48,1 + + with ncnn.Net() as net: + ret = net.load_param("tests/test.param") + net.load_model_mem(bytes(modelbin)) + assert ret == 0 and len(net.blobs()) == 3 and len(net.layers()) == 3 + + input_names = net.input_names() + output_names = net.output_names() + assert len(input_names) > 0 and len(output_names) > 0 + + in_mat = ncnn.Mat((227, 227, 3)) + + with net.create_extractor() as ex: + ex.input("data", in_mat) + ret, out_mat = ex.extract("output") + + assert ret == 0 and out_mat.dims == 1 and out_mat.w == 1 + + net.clear() + assert len(net.blobs()) == 0 and len(net.layers()) == 0 + + def test_net_vulkan(): if not hasattr(ncnn, "get_gpu_count"): return diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d3f55ce77900..803c34a780d4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -166,6 +166,7 @@ ncnn_add_layer(Erf) ncnn_add_layer(Diag) ncnn_add_layer(CELU) ncnn_add_layer(Shrink) +ncnn_add_layer(RMSNorm) if(NCNN_VULKAN) ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp) diff --git a/src/c_api.cpp b/src/c_api.cpp index 5662d1b51554..f8146e054c27 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -1240,6 +1240,13 @@ void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt) 
((Net*)net->pthis)->opt = *((Option*)opt); } +#if NCNN_VULKAN +void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index) +{ + ((Net*)net->pthis)->set_vulkan_device(device_index); +} +#endif + static ::ncnn::Layer* __Layer_c_api_layer_creator(void* userdata) { ncnn_net_custom_layer_factory_t ud = (ncnn_net_custom_layer_factory_t)userdata; diff --git a/src/c_api.h b/src/c_api.h index d153b2a4ef0f..f752bfed6636 100644 --- a/src/c_api.h +++ b/src/c_api.h @@ -275,6 +275,10 @@ NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net); NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net); NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt); +#if NCNN_VULKAN +NCNN_EXPORT void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index); +#endif + #if NCNN_STRING NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata); #endif /* NCNN_STRING */ diff --git a/src/cpu.cpp b/src/cpu.cpp index ba050e7b1e62..e42bcfafeb21 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -47,10 +47,9 @@ #include #endif -#if defined _WIN32 && !(defined __MINGW32__) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include -#include #endif #if defined __ANDROID__ || defined __linux__ @@ -130,8 +129,10 @@ #include #endif +#if (defined _WIN32 && (__aarch64__ || __arm__)) #define RUAPU_IMPLEMENTATION #include "ruapu.h" +#endif // topology info static int g_cpucount; @@ -597,9 +598,6 @@ static int get_cpu_support_x86_avx2() static int get_cpu_support_x86_avx_vnni() { -#if __APPLE__ - return ruapu_supports("avxvnni"); -#else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -618,13 +616,16 @@ static int get_cpu_support_x86_avx_vnni() x86_cpuid_sublevel(7, 1, cpu_info); return cpu_info[0] & (1u << 4); -#endif } static int get_cpu_support_x86_avx512() { #if __APPLE__ - return ruapu_supports("avx512f") && ruapu_supports("avx512bw") && ruapu_supports("avx512cd") && ruapu_supports("avx512dq") && ruapu_supports("avx512vl"); + return get_hw_capability("hw.optional.avx512f") + && get_hw_capability("hw.optional.avx512bw") + && get_hw_capability("hw.optional.avx512cd") + && get_hw_capability("hw.optional.avx512dq") + && get_hw_capability("hw.optional.avx512vl"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -654,7 +655,7 @@ static int get_cpu_support_x86_avx512() static int get_cpu_support_x86_avx512_vnni() { #if __APPLE__ - return ruapu_supports("avx512vnni"); + return get_hw_capability("hw.optional.avx512vnni"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -684,7 +685,7 @@ static int get_cpu_support_x86_avx512_vnni() static int get_cpu_support_x86_avx512_bf16() { #if __APPLE__ - return ruapu_supports("avx512bf16"); + return get_hw_capability("hw.optional.avx512bf16"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -710,7 +711,7 @@ static int get_cpu_support_x86_avx512_bf16() static int get_cpu_support_x86_avx512_fp16() { #if __APPLE__ - return ruapu_supports("avx512fp16"); + return get_hw_capability("hw.optional.avx512fp16"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -746,7 +747,7 @@ static int get_cpucount() count = emscripten_num_logical_cores(); else count = 1; -#elif (defined _WIN32 && !(defined __MINGW32__)) +#elif defined _WIN32 SYSTEM_INFO system_info; GetSystemInfo(&system_info); count = system_info.dwNumberOfProcessors; @@ -813,7 +814,7 @@ static int get_thread_siblings(int cpuid) static int 
get_physical_cpucount() { int count = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi == NULL) @@ -1051,7 +1052,7 @@ static int get_big_cpu_data_cache_size(int level) static int get_cpu_level2_cachesize() { int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) @@ -1121,7 +1122,7 @@ static int get_cpu_level2_cachesize() static int get_cpu_level3_cachesize() { int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) @@ -1168,7 +1169,7 @@ static int get_cpu_level3_cachesize() return size; } -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 static ncnn::CpuSet get_smt_cpu_mask() { ncnn::CpuSet smt_cpu_mask; @@ -1262,7 +1263,7 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) return 0; } -#endif // (defined _WIN32 && !(defined __MINGW32__)) +#endif // defined _WIN32 #if defined __ANDROID__ || defined __linux__ static int get_max_freq_khz(int cpuid) @@ -1436,7 +1437,7 @@ static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::Cp mask_all.enable(i); } -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 // get max freq mhz for all cores int max_freq_mhz_min = INT_MAX; int max_freq_mhz_max = 0; @@ -1953,7 +1954,7 @@ static void initialize_global_cpu_info() g_powersave = 0; initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big); -#if (defined _WIN32 && (__aarch64__ || __arm__)) || __APPLE__ +#if (defined _WIN32 && (__aarch64__ || __arm__)) if (!is_being_debugged()) { ruapu_init(); @@ -2030,7 +2031,7 @@ static inline void try_initialize_global_cpu_info() namespace ncnn { -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 CpuSet::CpuSet() { disable_all(); @@ -2799,7 +2800,7 @@ const CpuSet& get_cpu_thread_affinity_mask(int powersave) int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask) { try_initialize_global_cpu_info(); -#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__)) +#if defined __ANDROID__ || defined __linux__ || defined _WIN32 #ifdef _OPENMP int num_threads = thread_affinity_mask.num_enabled(); diff --git a/src/cpu.h b/src/cpu.h index 7d6bfce1108a..2ae6b8c3ffe9 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -17,7 +17,7 @@ #include -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include #endif @@ -40,7 +40,7 @@ class NCNN_EXPORT CpuSet int num_enabled() const; public: -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 ULONG_PTR mask; #endif #if defined __ANDROID__ || defined __linux__ diff --git a/src/layer/arm/convolution_3x3_pack1to8_fp16s.h b/src/layer/arm/convolution_3x3_pack1to8_fp16s.h index bd03d450b2e8..40e276cdedff 100644 --- a/src/layer/arm/convolution_3x3_pack1to8_fp16s.h +++ b/src/layer/arm/convolution_3x3_pack1to8_fp16s.h @@ 
-68,8 +68,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "sub %0, %0, #64 \n" "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1], #16 \n" // r0 - "ld1 {v1.4h}, [%1] \n" + "ldr q0, [%1], #16 \n" // r0 + "ldr s1, [%1] \n" "fmla v24.8h, %8.8h, v0.h[0] \n" "fmla v25.8h, %8.8h, v0.h[1] \n" @@ -99,8 +99,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[1] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v2.8h}, [%2], #16 \n" // r1 - "ld1 {v3.4h}, [%2] \n" + "ldr q2, [%2], #16 \n" // r1 + "ldr s3, [%2] \n" "fmla v24.8h, %11.8h, v2.h[0] \n" "fmla v25.8h, %11.8h, v2.h[1] \n" @@ -130,8 +130,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[1] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v4.8h}, [%3], #16 \n" // r2 - "ld1 {v5.4h}, [%3] \n" + "ldr q4, [%3], #16 \n" // r2 + "ldr s5, [%3] \n" "fmla v24.8h, %14.8h, v4.h[0] \n" "fmla v25.8h, %14.8h, v4.h[1] \n" @@ -189,7 +189,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3 "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1] \n" // r0 + "ldr q0, [%1] \n" // r0 "fmla v28.8h, %8.8h, v0.h[0] \n" "fmla v29.8h, %8.8h, v0.h[1] \n" @@ -207,7 +207,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v0.h[5] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v1.8h}, [%2] \n" // r1 + "ldr q1, [%2] \n" // r1 "fmla v28.8h, %11.8h, v1.h[0] \n" "fmla v29.8h, %11.8h, v1.h[1] \n" @@ -225,7 +225,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v1.h[5] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v2.8h}, [%3] \n" // r2 + "ldr q2, [%3] \n" // r2 "fmla v28.8h, %14.8h, v2.h[0] \n" "fmla v29.8h, %14.8h, v2.h[1] \n" @@ -274,7 +274,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v31.8h, %8.8h, v0.h[1] \n" @@ -284,7 +284,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v0.h[3] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v31.8h, %11.8h, v1.h[1] \n" @@ -294,7 +294,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v1.h[3] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v31.8h, %14.8h, v2.h[1] \n" @@ -332,24 +332,24 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob { asm volatile( "prfm pldl1keep, [%0, #128] \n" - "ld1 {v30.8h}, [%0] \n" // sum0 + "ldr q30, [%0] \n" // sum0 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v30.8h, %9.8h, v0.h[1] \n" "fmla v30.8h, %10.8h, v0.h[2] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v30.8h, %12.8h, v1.h[1] \n" "fmla v30.8h, %13.8h, v1.h[2] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v30.8h, %15.8h, 
v2.h[1] \n" @@ -359,7 +359,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "add %2, %2, #2 \n" "add %3, %3, #2 \n" - "st1 {v30.8h}, [%0], #16 \n" + "str q30, [%0], #16 \n" : "=r"(outptr0), // %0 "=r"(r0), // %1 @@ -445,8 +445,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3 "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1], #16 \n" // r0 - "ld1 {v1.h}[0], [%1] \n" + "ldr q0, [%1], #16 \n" // r0 + "ldr h1, [%1] \n" "fmla v28.8h, %8.8h, v0.h[0] \n" "fmla v29.8h, %8.8h, v0.h[2] \n" @@ -464,8 +464,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[0] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v2.8h}, [%2], #16 \n" // r1 - "ld1 {v3.h}[0], [%2] \n" + "ldr q2, [%2], #16 \n" // r1 + "ldr h3, [%2] \n" "fmla v28.8h, %11.8h, v2.h[0] \n" "fmla v29.8h, %11.8h, v2.h[2] \n" @@ -483,8 +483,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[0] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v4.8h}, [%3], #16 \n" // r2 - "ld1 {v5.h}[0], [%3] \n" + "ldr q4, [%3], #16 \n" // r2 + "ldr h5, [%3] \n" "fmla v28.8h, %14.8h, v4.h[0] \n" "fmla v29.8h, %14.8h, v4.h[2] \n" @@ -529,8 +529,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1], #8 \n" // r0 - "ld1 {v1.h}[0], [%1] \n" + "ldr d0, [%1], #8 \n" // r0 + "ldr h1, [%1] \n" "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v31.8h, %8.8h, v0.h[2] \n" @@ -540,8 +540,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[0] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v2.4h}, [%2], #8 \n" // r1 - "ld1 {v3.h}[0], [%2] \n" + "ldr d2, [%2], #8 \n" // r1 + "ldr h3, [%2] \n" "fmla v30.8h, %11.8h, v2.h[0] \n" "fmla v31.8h, %11.8h, v2.h[2] \n" @@ -551,8 +551,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[0] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v4.4h}, [%3], #8 \n" // r2 - "ld1 {v5.h}[0], [%3] \n" + "ldr d4, [%3], #8 \n" // r2 + "ldr h5, [%3] \n" "fmla v30.8h, %14.8h, v4.h[0] \n" "fmla v31.8h, %14.8h, v4.h[2] \n" @@ -586,24 +586,24 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob { asm volatile( "prfm pldl1keep, [%0, #128] \n" - "ld1 {v30.8h}, [%0] \n" // sum0 + "ldr q30, [%0] \n" // sum0 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v30.8h, %9.8h, v0.h[1] \n" "fmla v30.8h, %10.8h, v0.h[2] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v30.8h, %12.8h, v1.h[1] \n" "fmla v30.8h, %13.8h, v1.h[2] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v30.8h, %15.8h, v2.h[1] \n" @@ -613,7 +613,7 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "add %2, %2, #4 \n" "add %3, %3, #4 \n" - "st1 {v30.8h}, [%0], #16 \n" + "str q30, [%0], #16 \n" : "=r"(outptr0), // %0 "=r"(r0), // %1 diff --git a/src/layer/arm/convolution_im2col_gemm.h b/src/layer/arm/convolution_im2col_gemm.h index af501efa2f80..25a3e94d781a 100644 --- a/src/layer/arm/convolution_im2col_gemm.h +++ 
b/src/layer/arm/convolution_im2col_gemm.h @@ -3377,7 +3377,7 @@ static void convolution_gemm_transB_packed_tile(const Mat& AT_tile, const Mat& B "cbz %w10, 0f \n" "ld1 {v30.4s, v31.4s}, [%0] \n" - "b 3f \n" + "b 2f \n" "0: \n" // if pC diff --git a/src/layer/arm/convolution_im2col_gemm_bf16s.h b/src/layer/arm/convolution_im2col_gemm_bf16s.h index 82319d05850c..95819e2d679f 100644 --- a/src/layer/arm/convolution_im2col_gemm_bf16s.h +++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h @@ -3110,7 +3110,7 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "cbz %w10, 0f \n" "ld1 {v30.4s, v31.4s}, [%0] \n" - "b 3f \n" + "b 2f \n" "0: \n" // if pC @@ -3125,15 +3125,13 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "eor v31.16b, v31.16b, v31.16b \n" "2: \n" - - "3: \n" "lsr w4, %w9, #2 \n" // w4 = max_kk >> 2 "cmp w4, #0 \n" - "beq 5f \n" + "beq 4f \n" "eor v28.16b, v28.16b, v28.16b \n" "eor v29.16b, v29.16b, v29.16b \n" - "4: \n" + "3: \n" "prfm pldl1keep, [%2, #64] \n" "ld1 {v0.4h}, [%2], #8 \n" "shll v0.4s, v0.4h, #16 \n" @@ -3156,16 +3154,16 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "subs w4, w4, #1 \n" "fmla v30.4s, v10.4s, v0.s[3] \n" "fmla v31.4s, v11.4s, v0.s[3] \n" - "bne 4b \n" + "bne 3b \n" "fadd v30.4s, v30.4s, v28.4s \n" "fadd v31.4s, v31.4s, v29.4s \n" - "5: \n" + "4: \n" "and w4, %w9, #3 \n" // w4 = remain = max_kk & 3 "cmp w4, #0 \n" - "beq 7f \n" + "beq 6f \n" - "6: \n" + "5: \n" "ld1r {v0.4h}, [%2], #2 \n" "shll v0.4s, v0.4h, #16 \n" "ld1 {v3.8h}, [%1], #16 \n" @@ -3174,26 +3172,26 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "subs w4, w4, #1 \n" "fmla v30.4s, v4.4s, v0.4s \n" "fmla v31.4s, v5.4s, v0.4s \n" - "bne 6b \n" + "bne 5b \n" - "7: \n" + "6: \n" "shrn v30.4h, v30.4s, #16 \n" "shrn v31.4h, v31.4s, #16 \n" "tst %w11, #255 \n" - "beq 10f \n" + "beq 9f \n" // if out_elempack == 4 "cmp %w12, #4 \n" - "bne 8f \n" + "bne 7f \n" "lsl w4, %w13, #2 \n" "add x4, %3, w4, sxtw 1 \n" "st1 {v30.4h}, [%3], #8 \n" "st1 {v31.4h}, [x4] \n" - "b 9f \n" + "b 8f \n" // if out_elempack == 1 - "8: \n" + "7: \n" "add x4, %3, %w13, sxtw 1 \n" "st1 {v30.h}[0], [%3], #2 \n" "st1 {v30.h}[1], [x4] \n" @@ -3210,14 +3208,14 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "add x4, x4, %w13, sxtw 1 \n" "st1 {v31.h}[3], [x4] \n" - "9: \n" + "8: \n" "add %0, %0, #32 \n" - "b 11f \n" + "b 10f \n" - "10: \n" + "9: \n" "st1 {v30.4s, v31.4s}, [%0], #32 \n" - "11: \n" + "10: \n" : "=r"(outptr), // %0 "=r"(pA), // %1 diff --git a/src/layer/arm/rmsnorm_arm.cpp b/src/layer/arm/rmsnorm_arm.cpp new file mode 100644 index 000000000000..e19136ca29d6 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm.cpp @@ -0,0 +1,417 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "rmsnorm_arm.h" + +#if __ARM_NEON +#include +#endif // __ARM_NEON + +#include "arm_usability.h" +#include "cpu.h" + +namespace ncnn { + +RMSNorm_arm::RMSNorm_arm() +{ +#if __ARM_NEON + support_packing = true; +#if NCNN_ARM82 + support_fp16_storage = cpu_support_arm_asimdhp(); +#endif +#endif // __ARM_NEON + +#if NCNN_BF16 + support_bf16_storage = true; +#endif +} + +static void rmsnorm(float* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __ARM_NEON + float32x4_t _rms = vdupq_n_f32(0.f); +#endif // __ARM_NEON + float rms = 0.f; + { + const float* ptr0 = ptr; + + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr0); + _rms = vmlaq_f32(_rms, _p, _p); + ptr0 += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + rms += ptr0[0] * ptr0[0]; + ptr0++; + } + } + +#if __ARM_NEON + if (elempack == 4) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + +#if __aarch64__ + _rms = vdivq_f32(_rms, _elemcount); + _rms = vaddq_f32(_rms, _eps); +#else + float32x4_t _inv_elemcount = vrecpeq_f32(_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _rms = vmlaq_f32(_eps, _rms, _inv_elemcount); +#endif + + float32x4_t _rsqrt_rms = vrsqrteq_f32(_rms); + _rsqrt_rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + _rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + } +#endif // __ARM_NEON + if (elempack == 1) + { +#if __ARM_NEON +#if __aarch64__ + rms += vaddvq_f32(_rms); +#else + float32x2_t _s2 = vadd_f32(vget_low_f32(_rms), vget_high_f32(_rms)); + _s2 = vpadd_f32(_s2, _s2); + rms += vget_lane_f32(_s2, 0); +#endif +#endif // __ARM_NEON + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __ARM_NEON + _rms = vdupq_n_f32(rms); +#endif // __ARM_NEON + } + + if (gamma_ptr) + { + int i = 0; +#if __ARM_NEON + if (elempack == 4) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1q_f32(ptr, _p); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1q_f32(ptr, _p); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __ARM_NEON + for (; i < size; i++) + { + ptr[0] = (ptr[0] * rms) * gamma_ptr[0]; + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + _p = vmulq_f32(_p, _rms); + vst1q_f32(ptr, _p); + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + ptr[0] = ptr[0] * rms; + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int elembits = bottom_top_blob.elembits(); + +#if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + return forward_inplace_fp16s(bottom_top_blob, opt); +#endif + +#if NCNN_BF16 + if (opt.use_bf16_storage && elembits == 16) + return forward_inplace_bf16s(bottom_top_blob, opt); +#endif + + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = 
bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + float* ptr = bottom_top_blob; + rmsnorm(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + rmsnorm(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} + +#if NCNN_BF16 +static void rmsnorm_bf16s(unsigned short* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __ARM_NEON + float32x4_t _rms = vdupq_n_f32(0.f); +#endif // __ARM_NEON + float rms = 0.f; + { + const unsigned short* ptr0 = ptr; + + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr0)); + _rms = vmlaq_f32(_rms, _p, _p); + ptr0 += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr0[0]); + rms += v * v; + ptr0++; + } + } + +#if __ARM_NEON + if (elempack == 4) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + +#if __aarch64__ + _rms = vdivq_f32(_rms, _elemcount); + _rms = vaddq_f32(_rms, _eps); +#else + float32x4_t _inv_elemcount = vrecpeq_f32(_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _rms = vmlaq_f32(_eps, _rms, _inv_elemcount); +#endif + + float32x4_t _rsqrt_rms = vrsqrteq_f32(_rms); + _rsqrt_rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + _rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + } +#endif // __ARM_NEON + if (elempack == 1) + { +#if __ARM_NEON +#if __aarch64__ + rms += vaddvq_f32(_rms); +#else + float32x2_t _s2 = vadd_f32(vget_low_f32(_rms), vget_high_f32(_rms)); + _s2 = vpadd_f32(_s2, _s2); + rms += vget_lane_f32(_s2, 0); +#endif +#endif // __ARM_NEON + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __ARM_NEON + _rms = vdupq_n_f32(rms); +#endif // __ARM_NEON + } + + if (gamma_ptr) + { + int i = 0; +#if __ARM_NEON + if (elempack == 4) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr[0]); + ptr[0] = float32_to_bfloat16((v * rms) * gamma_ptr[0]); + ptr++; + gamma_ptr++; + } + } + else + { + 
int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + _p = vmulq_f32(_p, _rms); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr[0]); + ptr[0] = float32_to_bfloat16(v * rms); + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + unsigned short* ptr = bottom_top_blob; + rmsnorm_bf16s(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + unsigned short* ptr = bottom_top_blob.row(i); + rmsnorm_bf16s(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + unsigned short* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm_bf16s(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + unsigned short* ptr = bottom_top_blob.channel(q); + rmsnorm_bf16s(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} +#endif // NCNN_BF16 + +} // namespace ncnn diff --git a/src/layer/arm/rmsnorm_arm.h b/src/layer/arm/rmsnorm_arm.h new file mode 100644 index 000000000000..440153333710 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm.h @@ -0,0 +1,40 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_ARM_H +#define LAYER_RMSNORM_ARM_H + +#include "rmsnorm.h" + +namespace ncnn { + +class RMSNorm_arm : public RMSNorm +{ +public: + RMSNorm_arm(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if NCNN_ARM82 + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; +#endif +#if NCNN_BF16 + int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_ARM_H diff --git a/src/layer/arm/rmsnorm_arm_asimdhp.cpp b/src/layer/arm/rmsnorm_arm_asimdhp.cpp new file mode 100644 index 000000000000..98d8e6964876 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm_asimdhp.cpp @@ -0,0 +1,272 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
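Reviewer note: in both the fp32 and bf16 kernels above, the packed path computes 1/sqrt(mean_sq + eps) with vrsqrteq_f32 followed by two vrsqrtsq_f32 Newton-Raphson steps (and, on non-AArch64, the division by elemcount uses the analogous vrecpeq_f32/vrecpsq_f32 pair). A scalar sketch of that refinement follows, with a deliberately rough initial estimate standing in for the hardware approximation; the input value is illustrative only.

```cpp
// Scalar model of the vrsqrteq_f32 + 2x vrsqrtsq_f32 sequence used above.
// vrsqrtsq_f32(a, b) returns (3 - a*b) / 2, so each step below is one
// Newton-Raphson iteration y <- y * (3 - x*y*y) / 2 toward 1/sqrt(x).
#include <cmath>
#include <cstdio>

static float rsqrt_refined(float x)
{
    // stand-in for the coarse hardware estimate produced by vrsqrteq_f32
    float y = 1.0f / std::sqrt(x) * 1.01f;

    y = y * (3.0f - x * y * y) * 0.5f; // first vrsqrtsq step
    y = y * (3.0f - x * y * y) * 0.5f; // second vrsqrtsq step
    return y;
}

int main()
{
    const float mean_sq_plus_eps = 7.5f;
    printf("refined=%.9f exact=%.9f\n",
           rsqrt_refined(mean_sq_plus_eps), 1.0f / std::sqrt(mean_sq_plus_eps));
    return 0;
}
```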
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "rmsnorm_arm.h" + +#if __ARM_NEON +#include +#include "arm_usability.h" +#endif // __ARM_NEON + +namespace ncnn { + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +static void rmsnorm_fp16s(__fp16* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + + float32x4_t _rms0 = vdupq_n_f32(0.f); + float32x4_t _rms1 = vdupq_n_f32(0.f); + float rms = 0.f; + { + const __fp16* ptr0 = ptr; + + int i = 0; + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr0); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + _rms0 = vmlaq_f32(_rms0, _p0, _p0); + _rms1 = vmlaq_f32(_rms1, _p1, _p1); + ptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr0)); + _rms0 = vmlaq_f32(_rms0, _p, _p); + ptr0 += 4; + } + for (; i < size; i++) + { + rms += (float)ptr0[0] * (float)ptr0[0]; + ptr0++; + } + } + + if (elempack == 8) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + + _rms0 = vdivq_f32(_rms0, _elemcount); + _rms1 = vdivq_f32(_rms1, _elemcount); + _rms0 = vaddq_f32(_rms0, _eps); + _rms1 = vaddq_f32(_rms1, _eps); + + float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0); + float32x4_t _rsqrt_rms1 = vrsqrteq_f32(_rms1); + _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rsqrt_rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1); + _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1); + } + if (elempack == 4) + { + _rms0 = vaddq_f32(_rms0, _rms1); + + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + + _rms0 = vdivq_f32(_rms0, _elemcount); + _rms0 = vaddq_f32(_rms0, _eps); + + float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0); + _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms1 = _rms0; + } + if (elempack == 1) + { + _rms0 = vaddq_f32(_rms0, _rms1); + rms += vaddvq_f32(_rms0); + + rms = 1.f / sqrtf(rms / elemcount + eps); + _rms0 = vdupq_n_f32(rms); + _rms1 = _rms0; + } + + if (gamma_ptr) + { + int i = 0; + if (elempack == 8) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma); + _p1 = vmulq_f32(_p1, _gamma); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 1; + } + } + if (elempack == 4) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = 
vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma0 = vdupq_n_f32(gamma_ptr[0]); + float32x4_t _gamma1 = vdupq_n_f32(gamma_ptr[1]); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma0); + _p1 = vmulq_f32(_p1, _gamma1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 2; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms0); + _p = vmulq_f32(_p, _gamma); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma0 = vld1q_f32(gamma_ptr); + float32x4_t _gamma1 = vld1q_f32(gamma_ptr + 4); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma0); + _p1 = vmulq_f32(_p1, _gamma1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms0); + _p = vmulq_f32(_p, _gamma); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + gamma_ptr += 4; + } + } + for (; i < size; i++) + { + ptr[0] = (__fp16)(((float)ptr[0] * rms) * gamma_ptr[0]); + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + _p = vmulq_f32(_p, _rms0); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + } + for (; i < size; i++) + { + ptr[0] = (__fp16)((float)ptr[0] * rms); + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + __fp16* ptr = bottom_top_blob; + rmsnorm_fp16s(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + rmsnorm_fp16s(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i); + rmsnorm_fp16s(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + rmsnorm_fp16s(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +} // namespace ncnn diff --git a/src/layer/embed.cpp b/src/layer/embed.cpp index ddda6b8bf199..2b9f8a60042c 100644 --- a/src/layer/embed.cpp +++ b/src/layer/embed.cpp @@ -30,6 +30,7 @@ int Embed::load_param(const ParamDict& pd) input_dim = pd.get(1, 0); bias_term = pd.get(2, 0); weight_data_size = pd.get(3, 0); + int8_scale_term = pd.get(18, 0); return 0; } @@ -47,18 +48,23 @@ int Embed::load_model(const ModelBin& mb) return -100; } +#if NCNN_INT8 + if (int8_scale_term) + { + weight_data_int8_scale = mb.load(1, 1)[0]; + } +#endif // NCNN_INT8 + return 0; } -int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) { - int words = static_cast(bottom_blob.total()); + const int num_output = top_blob.w; + const int words = top_blob.h; - top_blob.create(num_output, words, 4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const float* bias_ptr = bias_data; - // num_output #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < words; q++) { @@ -73,15 +79,79 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) con const float* em = (const float*)weight_data + num_output * word_index; - memcpy(outptr, em, num_output * sizeof(float)); + if (bias_ptr) + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] + bias_ptr[p]; + } + } + else + { + memcpy(outptr, em, num_output * sizeof(float)); + } + } +} + +#if NCNN_INT8 +static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) +{ + const int num_output = top_blob.w; + const int words = top_blob.h; + + const float* bias_ptr = bias_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < words; q++) + { + float* outptr = top_blob.row(q); + + int word_index = ((const int*)bottom_blob)[q]; - if (bias_term) + if (word_index < 0) + word_index = 0; + if (word_index >= input_dim) + word_index = input_dim - 1; + + const float descale_em = 1.f / weight_data_int8_scale; + + const signed char* em = (const signed char*)weight_data + num_output * word_index; + + if (bias_ptr) { for (int p = 0; p < num_output; p++) { - outptr[p] += bias_data[p]; + outptr[p] = em[p] * descale_em + bias_ptr[p]; } } + else + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] * descale_em; + } + } + } +} +#endif // NCNN_INT8 + +int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int words = static_cast(bottom_blob.total()); + + top_blob.create(num_output, words, 4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if NCNN_INT8 + if (int8_scale_term) + { + embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt); + } + else +#endif // NCNN_INT8 + { + embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt); } return 0; diff --git a/src/layer/embed.h b/src/layer/embed.h index 8e2366567163..b94c2b17bee4 100644 --- a/src/layer/embed.h +++ b/src/layer/embed.h @@ -38,9 +38,15 @@ class Embed : public Layer int weight_data_size; + int int8_scale_term; + // model Mat weight_data; Mat bias_data; + +#if NCNN_INT8 + float weight_data_int8_scale; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 
980261a14966..2ec10bae48a5 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -308,7 +308,7 @@ _RVV_FLOAT32_COS_OP(8, 4) \ /* clamp the inputs to the range [-9, 9] since anything outside */ \ /* this range is -/+1.0f in single-precision. */ \ - x2 = vfmin_vf_f32m##LMUL(x, c_tanh_hi, vl); \ + x2 = vfmin_vf_f32m##LMUL(x2, c_tanh_hi, vl); \ \ /* since the polynomials are odd/even, we need x**2. */ \ vfloat32m##LMUL##_t z = vfmul_vv_f32m##LMUL(x2, x2, vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index ee5ffe4a304b..2cf5d08f4f0b 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -308,7 +308,7 @@ _RVV_FLOAT16_COS_OP(8, 2) \ /* clamp the inputs to the range [-9, 9] since anything outside */ \ /* this range is -/+1.0f in single-precision. */ \ - x2 = vfmin_vf_f16m##LMUL(x, c_tanh_hi, vl); \ + x2 = vfmin_vf_f16m##LMUL(x2, c_tanh_hi, vl); \ \ /* since the polynomials are odd/even, we need x**2. */ \ vfloat16m##LMUL##_t z = vfmul_vv_f16m##LMUL(x2, x2, vl); \ diff --git a/src/layer/rmsnorm.cpp b/src/layer/rmsnorm.cpp new file mode 100644 index 000000000000..77c74c6bccbb --- /dev/null +++ b/src/layer/rmsnorm.cpp @@ -0,0 +1,200 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
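Reviewer note on the two rvv_mathfun hunks above: the tanh approximation clamps its input to [-9, 9] in two steps, presumably x2 = max(x, c_tanh_lo) on the preceding line and then the min against c_tanh_hi; taking the min of the original x instead of x2 silently discarded the lower clamp, so large negative inputs reached the polynomial unclamped. A plain-scalar sketch of the wrong vs. fixed sequence (illustrative values, not the RVV intrinsics):

```cpp
// Illustration of the clamp bug fixed above (plain scalars, not RVV intrinsics).
// Assuming the preceding line computes x2 = max(x, lo), the min must then be
// applied to x2; applying it to x throws the lower bound away.
#include <algorithm>
#include <cstdio>

int main()
{
    const float lo = -9.f, hi = 9.f;
    const float x = -20.f;

    const float x2 = std::max(x, lo);
    const float wrong = std::min(x, hi);  // -20: lower clamp lost
    const float fixed = std::min(x2, hi); // -9: clamped as intended

    printf("wrong=%.1f fixed=%.1f\n", wrong, fixed);
    return 0;
}
```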
+ +#include "rmsnorm.h" + +namespace ncnn { + +RMSNorm::RMSNorm() +{ + one_blob_only = true; + support_inplace = true; +} + +int RMSNorm::load_param(const ParamDict& pd) +{ + affine_size = pd.get(0, 0); + eps = pd.get(1, 0.001f); + affine = pd.get(2, 1); + + return 0; +} + +int RMSNorm::load_model(const ModelBin& mb) +{ + if (affine == 0) + return 0; + + gamma_data = mb.load(affine_size, 1); + if (gamma_data.empty()) + return -100; + + return 0; +} + +int RMSNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + // x = x / sqrt(rms + eps) * gamma + + int dims = bottom_top_blob.dims; + + if (dims == 1) + { + int w = bottom_top_blob.w; + // assert affine_size == w + + float* ptr = bottom_top_blob; + + float sqsum = 0.f; + for (int i = 0; i < w; i++) + { + sqsum += ptr[i] * ptr[i]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int i = 0; i < w; i++) + { + ptr[i] = (ptr[i] * a) * gamma_data[i]; + } + } + else + { + for (int i = 0; i < w; i++) + { + ptr[i] = ptr[i] * a; + } + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + + float sqsum = 0.f; + for (int j = 0; j < w; j++) + { + sqsum += ptr[j] * ptr[j]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int j = 0; j < w; j++) + { + ptr[j] = (ptr[j] * a) * gamma_data[j]; + } + } + else + { + for (int j = 0; j < w; j++) + { + ptr[j] = ptr[j] * a; + } + } + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + + float sqsum = 0.f; + for (int j = 0; j < w; j++) + { + sqsum += ptr[j] * ptr[j]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int j = 0; j < w; j++) + { + ptr[j] = (ptr[j] * a) * gamma_data[j]; + } + } + else + { + for (int j = 0; j < w; j++) + { + ptr[j] = ptr[j] * a; + } + } + } + } + } + else // if (affine_size == size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + float sqsum = 0.f; + for (int i = 0; i < size; i++) + { + sqsum += ptr[i] * ptr[i]; + } + float rms = sqrtf(sqsum / size + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int i = 0; i < size; i++) + { + ptr[i] = (ptr[i] * a) * gamma_data[i]; + } + } + else + { + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a; + } + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/rmsnorm.h b/src/layer/rmsnorm.h new file mode 100644 index 000000000000..4a09f2548bdf --- /dev/null +++ b/src/layer/rmsnorm.h @@ -0,0 +1,43 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_H +#define LAYER_RMSNORM_H + +#include "layer.h" + +namespace ncnn { + +class RMSNorm : public Layer +{ +public: + RMSNorm(); + + virtual int load_param(const ParamDict& pd); + + virtual int load_model(const ModelBin& mb); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +public: + int affine_size; + float eps; + int affine; + + Mat gamma_data; +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_H diff --git a/src/layer/x86/rmsnorm_x86.cpp b/src/layer/x86/rmsnorm_x86.cpp new file mode 100644 index 000000000000..db592c3e3810 --- /dev/null +++ b/src/layer/x86/rmsnorm_x86.cpp @@ -0,0 +1,413 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
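Reviewer note: the reference layer above implements x_i <- x_i / sqrt(mean(x^2) + eps) * gamma_i over each affine_size span, with no mean subtraction (which is the difference from LayerNorm). A standalone sketch of the dims == 1 path with made-up numbers, just to pin down the arithmetic; it is not an ncnn test and does not use the ncnn API.

```cpp
// Minimal standalone sketch of the dims==1 branch above (hypothetical values,
// not part of the patch): x_i <- x_i / sqrt(mean(x^2) + eps) * gamma_i.
#include <cmath>
#include <cstdio>

int main()
{
    float x[4] = {1.f, 2.f, 3.f, 4.f};
    const float gamma[4] = {1.f, 1.f, 1.f, 1.f};
    const float eps = 1e-6f;
    const int w = 4;

    float sqsum = 0.f;
    for (int i = 0; i < w; i++)
        sqsum += x[i] * x[i];                         // 30

    const float a = 1.f / std::sqrt(sqsum / w + eps); // 1/sqrt(7.5) ~= 0.36515

    for (int i = 0; i < w; i++)
        x[i] = x[i] * a * gamma[i];

    printf("%.3f %.3f %.3f %.3f\n", x[0], x[1], x[2], x[3]); // ~0.365 0.730 1.095 1.461
    return 0;
}
```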
+ +#include "rmsnorm_x86.h" + +#if __SSE2__ +#include +#if __AVX__ +#include +#endif // __AVX__ +#endif // __SSE2__ + +#include "x86_usability.h" + +namespace ncnn { + +RMSNorm_x86::RMSNorm_x86() +{ +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ +} + +static void rmsnorm(float* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + __m512 _rms_avx512 = _mm512_set1_ps(0.f); +#endif // __AVX512F__ + __m256 _rms_avx = _mm256_set1_ps(0.f); +#endif // __AVX__ + __m128 _rms = _mm_set1_ps(0.f); +#endif // __SSE2__ + float rms = 0.f; + { + const float* ptr0 = ptr; + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr0); + _rms_avx512 = _mm512_fmadd_ps(_p, _p, _rms_avx512); + ptr0 += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr0); + _rms_avx = _mm256_comp_fmadd_ps(_p, _p, _rms_avx); + ptr0 += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr0); + _rms = _mm_comp_fmadd_ps(_p, _p, _rms); + ptr0 += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + rms += ptr0[0] * ptr0[0]; + ptr0++; + } + } + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + __m512 _elemcount = _mm512_set1_ps((float)elemcount); + __m512 _eps = _mm512_set1_ps(eps); + + _rms_avx512 = _mm512_div_ps(_rms_avx512, _elemcount); + _rms_avx512 = _mm512_add_ps(_rms_avx512, _eps); + + __m256 _rms0 = _mm256_rsqrt_ps(_mm512_extractf32x8_ps(_rms_avx512, 0)); + __m256 _rms1 = _mm256_rsqrt_ps(_mm512_extractf32x8_ps(_rms_avx512, 1)); + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms0), _rms1, 1); + } +#endif // __AVX512F__ + if (elempack == 8) + { +#if __AVX512F__ + { + __m256 _rms0 = _mm512_castps512_ps256(_rms_avx512); + __m256 _rms1 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(_rms_avx512), 1)); + _rms_avx = _mm256_add_ps(_rms_avx, _rms0); + _rms_avx = _mm256_add_ps(_rms_avx, _rms1); + } +#endif // __AVX512F__ + + __m256 _elemcount = _mm256_set1_ps((float)elemcount); + __m256 _eps = _mm256_set1_ps(eps); + + _rms_avx = _mm256_div_ps(_rms_avx, _elemcount); + _rms_avx = _mm256_add_ps(_rms_avx, _eps); + + _rms_avx = _mm256_rsqrt_ps(_rms_avx); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ + } +#endif // __AVX__ + if (elempack == 4) + { +#if __AVX__ +#if __AVX512F__ + { + __m256 _rms0 = _mm512_castps512_ps256(_rms_avx512); + __m256 _rms1 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(_rms_avx512), 1)); + _rms_avx = _mm256_add_ps(_rms_avx, _rms0); + _rms_avx = _mm256_add_ps(_rms_avx, _rms1); + } +#endif // __AVX512F__ + { + __m128 _rms0 = _mm256_castps256_ps128(_rms_avx); + __m128 _rms1 = _mm256_extractf128_ps(_rms_avx, 1); + _rms = _mm_add_ps(_rms, _rms0); + _rms = _mm_add_ps(_rms, _rms1); + } +#endif // __AVX__ + + __m128 _elemcount = _mm_set1_ps((float)elemcount); + __m128 _eps = _mm_set1_ps(eps); + + _rms = _mm_div_ps(_rms, _elemcount); + _rms = _mm_add_ps(_rms, _eps); + + _rms = _mm_rsqrt_ps(_rms); +#if __AVX__ + _rms_avx = _mm256_insertf128_ps(_mm256_castps128_ps256(_rms), _rms, 1); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ +#endif // __AVX__ + } +#endif // __SSE2__ + if (elempack == 1) + { +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + rms 
+= _mm512_comp_reduce_add_ps(_rms_avx512); +#endif // __AVX512F__ + rms += _mm256_reduce_add_ps(_rms_avx); +#endif // __AVX__ + rms += _mm_reduce_add_ps(_rms); +#endif // __SSE2__ + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __SSE2__ + _rms = _mm_set1_ps(rms); +#if __AVX__ + _rms_avx = _mm256_insertf128_ps(_mm256_castps128_ps256(_rms), _rms, 1); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + } + + if (gamma_ptr) + { + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _gamma = _mm512_set1_ps(gamma_ptr[0]); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 1; + } + } +#endif // __AVX512F__ + if (elempack == 8) + { +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m256 _gamma0 = _mm256_set1_ps(gamma_ptr[0]); + __m256 _gamma1 = _mm256_set1_ps(gamma_ptr[1]); + __m512 _gamma = _mm512_insertf32x8(_mm512_castps256_ps512(_gamma0), _gamma1, 1); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 2; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _gamma = _mm256_set1_ps(gamma_ptr[0]); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 1; + } + } +#endif // __AVX__ + if (elempack == 4) + { +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m128 _gamma0 = _mm_set1_ps(gamma_ptr[0]); + __m128 _gamma1 = _mm_set1_ps(gamma_ptr[1]); + __m128 _gamma2 = _mm_set1_ps(gamma_ptr[2]); + __m128 _gamma3 = _mm_set1_ps(gamma_ptr[3]); + __m256 _gamma01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma0), _gamma1, 1); + __m256 _gamma23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma2), _gamma3, 1); + __m512 _gamma = _mm512_insertf32x8(_mm512_castps256_ps512(_gamma01), _gamma23, 1); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 4; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m128 _gamma0 = _mm_set1_ps(gamma_ptr[0]); + __m128 _gamma1 = _mm_set1_ps(gamma_ptr[1]); + __m256 _gamma = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma0), _gamma1, 1); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 2; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _gamma = _mm_set1_ps(gamma_ptr[0]); + _p = _mm_mul_ps(_p, _rms); + _p = _mm_mul_ps(_p, _gamma); + _mm_storeu_ps(ptr, _p); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _gamma = _mm512_loadu_ps(gamma_ptr); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _gamma = _mm256_loadu_ps(gamma_ptr); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + 
_mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _gamma = _mm_loadu_ps(gamma_ptr); + _p = _mm_mul_ps(_p, _rms); + _p = _mm_mul_ps(_p, _gamma); + _mm_storeu_ps(ptr, _p); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __SSE2__ + for (; i < size; i++) + { + ptr[0] = (ptr[0] * rms) * gamma_ptr[0]; + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + _p = _mm512_mul_ps(_p, _rms_avx512); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + _p = _mm256_mul_ps(_p, _rms_avx); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + _p = _mm_mul_ps(_p, _rms); + _mm_storeu_ps(ptr, _p); + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + ptr[0] = ptr[0] * rms; + ptr++; + } + } +} + +int RMSNorm_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + float* ptr = bottom_top_blob; + rmsnorm(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + rmsnorm(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/rmsnorm_x86.h b/src/layer/x86/rmsnorm_x86.h new file mode 100644 index 000000000000..2e6296db1c32 --- /dev/null +++ b/src/layer/x86/rmsnorm_x86.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
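Reviewer note: in the x86 kernel above, the elempack == 1 case reduces the per-lane accumulators to a single scalar rms and then rebroadcasts it back up through the SSE/AVX/AVX-512 registers via _mm256_insertf128_ps(_mm256_castps128_ps256(x), x, 1) and _mm512_insertf32x8(...), so one value drives every vector loop width. A tiny sketch of that widening trick follows; it assumes an AVX-capable build (e.g. -mavx) and is illustration only.

```cpp
// Sketch of the lane-widening trick used above: duplicate a 128-bit value into
// both halves of a 256-bit register so a single scalar rms can feed the AVX loop.
#include <immintrin.h>
#include <cstdio>

int main()
{
    __m128 r = _mm_set1_ps(0.25f);
    // low half = r, high half = r
    __m256 r8 = _mm256_insertf128_ps(_mm256_castps128_ps256(r), r, 1);

    float out[8];
    _mm256_storeu_ps(out, r8);
    for (int i = 0; i < 8; i++)
        printf("%.2f ", out[i]); // prints 0.25 eight times
    printf("\n");
    return 0;
}
```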
+ +#ifndef LAYER_RMSNORM_X86_H +#define LAYER_RMSNORM_X86_H + +#include "rmsnorm.h" + +namespace ncnn { + +class RMSNorm_x86 : public RMSNorm +{ +public: + RMSNorm_x86(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_X86_H diff --git a/src/platform.h.in b/src/platform.h.in index a0f17f39e315..50a9454b7da0 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -70,7 +70,7 @@ #ifdef __cplusplus #if NCNN_THREADS -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include #include @@ -86,7 +86,7 @@ namespace ncnn { #if NCNN_THREADS -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 class NCNN_EXPORT Mutex { public: @@ -141,7 +141,7 @@ public: private: DWORD key; }; -#else // (defined _WIN32 && !(defined __MINGW32__)) +#else // defined _WIN32 class NCNN_EXPORT Mutex { public: @@ -186,7 +186,7 @@ public: private: pthread_key_t key; }; -#endif // (defined _WIN32 && !(defined __MINGW32__)) +#endif // defined _WIN32 #else // NCNN_THREADS class NCNN_EXPORT Mutex { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d0d2a66899a6..54e778e35e79 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -153,6 +153,7 @@ ncnn_add_layer_test(Dropout) ncnn_add_layer_test(Einsum) ncnn_add_layer_test(Eltwise) ncnn_add_layer_test(ELU) +ncnn_add_layer_test(Embed) ncnn_add_layer_test(Erf) ncnn_add_layer_test(ExpandDims) ncnn_add_layer_test(Flatten) @@ -193,6 +194,7 @@ ncnn_add_layer_test(ReLU) ncnn_add_layer_test(Reorg) ncnn_add_layer_test(Requantize) ncnn_add_layer_test(Reshape) +ncnn_add_layer_test(RMSNorm) ncnn_add_layer_test(RNN) ncnn_add_layer_test(ROIPooling) ncnn_add_layer_test(ROIAlign) diff --git a/tests/test_embed.cpp b/tests/test_embed.cpp new file mode 100644 index 000000000000..9c007ee5d7e7 --- /dev/null +++ b/tests/test_embed.cpp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "testutil.h" + +static int test_embed(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + + std::vector weights(bias ? 
2 : 1); + weights[0] = RandomMat(num_output * input_dim); + if (bias) + weights[1] = RandomMat(num_output); + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_0() +{ + return 0 + || test_embed(128, 128, 128, 0) + || test_embed(128, 128, 128, 1) + || test_embed(127, 127, 127, 0) + || test_embed(127, 127, 127, 1) + || test_embed(124, 124, 124, 0) + || test_embed(124, 124, 124, 1); +} + +#if NCNN_INT8 +static int test_embed_int8(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + pd.set(18, 2); + + std::vector weights(bias ? 3 : 2); + weights[0] = RandomS8Mat(num_output * input_dim); + if (bias) + { + weights[1] = RandomMat(num_output); + weights[2] = RandomMat(1, 100.f, 200.f); + } + else + { + weights[1] = RandomMat(1, 100.f, 200.f); + } + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed_int8 failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_1() +{ + return 0 + || test_embed_int8(128, 128, 128, 0) + || test_embed_int8(128, 128, 128, 1) + || test_embed_int8(127, 127, 127, 0) + || test_embed_int8(127, 127, 127, 1) + || test_embed_int8(124, 124, 124, 0) + || test_embed_int8(124, 124, 124, 1); +} +#endif // NCNN_INT8 + +int main() +{ + SRAND(7767517); + +#if NCNN_INT8 + return test_embed_0() || test_embed_1(); +#else + return test_embed_0(); +#endif +} diff --git a/tests/test_rmsnorm.cpp b/tests/test_rmsnorm.cpp new file mode 100644 index 000000000000..2d88c162d8b5 --- /dev/null +++ b/tests/test_rmsnorm.cpp @@ -0,0 +1,121 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
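Reviewer note: test_embed_int8 above stores the scale as an extra 1-element weight drawn from [100, 200] and sets param 18, matching the new Embed int8 path in embed.cpp where each quantized row is recovered as em[p] * (1 / weight_data_int8_scale) plus the optional bias. A tiny sketch of that descale arithmetic with made-up values (not the RandomS8Mat test data):

```cpp
// Hedged sketch of the int8 descale used in embed.cpp above; the weights, scale
// and bias here are made-up illustration values only.
#include <cstdio>

int main()
{
    const signed char em[4] = {-64, 0, 32, 127};
    const float weight_data_int8_scale = 127.f; // e.g. weights stored as round(w * 127)
    const float bias = 0.5f;

    const float descale_em = 1.f / weight_data_int8_scale;
    for (int p = 0; p < 4; p++)
        printf("%.4f ", em[p] * descale_em + bias); // -0.0039 0.5000 0.7520 1.5000
    printf("\n");
    return 0;
}
```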
+ +#include "testutil.h" + +static int test_rmsnorm(const ncnn::Mat& a, int affine_size, float eps, int affine) +{ + ncnn::ParamDict pd; + pd.set(0, affine_size); + pd.set(1, eps); + pd.set(2, affine); + + std::vector weights(1); + weights[0] = RandomMat(affine_size); + + int ret = test_layer("RMSNorm", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_rmsnorm failed a.dims=%d a=(%d %d %d) affine_size=%d eps=%f affine=%d\n", a.dims, a.w, a.h, a.c, affine_size, eps, affine); + } + + return ret; +} + +static int test_rmsnorm_0() +{ + return 0 + || test_rmsnorm(RandomMat(6, 4, 2), 6, 0.01f, 0) + || test_rmsnorm(RandomMat(4, 5, 6), 4, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 3, 8), 3, 0.002f, 0) + || test_rmsnorm(RandomMat(5, 6, 12), 5, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 7, 16), 4, 0.02f, 0) + || test_rmsnorm(RandomMat(6, 7, 24), 6, 0.001f, 0) + || test_rmsnorm(RandomMat(5, 8, 32), 5, 0.001f, 0) + || test_rmsnorm(RandomMat(6, 4, 2), 6, 0.01f, 1) + || test_rmsnorm(RandomMat(4, 5, 6), 4, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 3, 8), 3, 0.002f, 1) + || test_rmsnorm(RandomMat(5, 6, 12), 5, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 7, 16), 4, 0.02f, 1) + || test_rmsnorm(RandomMat(6, 7, 24), 6, 0.001f, 1) + || test_rmsnorm(RandomMat(5, 8, 32), 5, 0.001f, 1); +} + +static int test_rmsnorm_1() +{ + return 0 + || test_rmsnorm(RandomMat(6, 4, 2), 24, 0.01f, 0) + || test_rmsnorm(RandomMat(4, 5, 6), 20, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 3, 8), 9, 0.002f, 0) + || test_rmsnorm(RandomMat(5, 6, 12), 30, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 7, 16), 28, 0.02f, 0) + || test_rmsnorm(RandomMat(6, 7, 24), 42, 0.001f, 0) + || test_rmsnorm(RandomMat(5, 8, 32), 40, 0.001f, 0) + || test_rmsnorm(RandomMat(6, 4, 2), 24, 0.01f, 1) + || test_rmsnorm(RandomMat(4, 5, 6), 20, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 3, 8), 9, 0.002f, 1) + || test_rmsnorm(RandomMat(5, 6, 12), 30, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 7, 16), 28, 0.02f, 1) + || test_rmsnorm(RandomMat(6, 7, 24), 42, 0.001f, 1) + || test_rmsnorm(RandomMat(5, 8, 32), 40, 0.001f, 1); +} + +static int test_rmsnorm_2() +{ + return 0 + || test_rmsnorm(RandomMat(4, 2), 4, 0.01f, 0) + || test_rmsnorm(RandomMat(5, 6), 5, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 8), 3, 0.002f, 0) + || test_rmsnorm(RandomMat(6, 12), 6, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 16), 4, 0.02f, 0) + || test_rmsnorm(RandomMat(7, 24), 7, 0.001f, 0) + || test_rmsnorm(RandomMat(8, 32), 8, 0.001f, 0) + || test_rmsnorm(RandomMat(4, 2), 4, 0.01f, 1) + || test_rmsnorm(RandomMat(5, 6), 5, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 8), 3, 0.002f, 1) + || test_rmsnorm(RandomMat(6, 12), 6, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 16), 4, 0.02f, 1) + || test_rmsnorm(RandomMat(7, 24), 7, 0.001f, 1) + || test_rmsnorm(RandomMat(8, 32), 8, 0.001f, 1); +} + +static int test_rmsnorm_3() +{ + return 0 + || test_rmsnorm(RandomMat(2), 2, 0.01f, 0) + || test_rmsnorm(RandomMat(6), 6, 0.01f, 0) + || test_rmsnorm(RandomMat(8), 8, 0.002f, 0) + || test_rmsnorm(RandomMat(12), 12, 0.02f, 0) + || test_rmsnorm(RandomMat(16), 16, 0.02f, 0) + || test_rmsnorm(RandomMat(24), 24, 0.001f, 0) + || test_rmsnorm(RandomMat(32), 32, 0.001f, 0) + || test_rmsnorm(RandomMat(2), 2, 0.01f, 1) + || test_rmsnorm(RandomMat(6), 6, 0.01f, 1) + || test_rmsnorm(RandomMat(8), 8, 0.002f, 1) + || test_rmsnorm(RandomMat(12), 12, 0.02f, 1) + || test_rmsnorm(RandomMat(16), 16, 0.02f, 1) + || test_rmsnorm(RandomMat(24), 24, 0.001f, 1) + || test_rmsnorm(RandomMat(32), 32, 0.001f, 1); +} + +int main() +{ + SRAND(7767517); + + 
return 0 + || test_rmsnorm_0() + || test_rmsnorm_1() + || test_rmsnorm_2() + || test_rmsnorm_3(); +} diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 88ccb948a9c8..ff86338bca9c 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -99,6 +99,7 @@ #include "layer/reorg.h" #include "layer/requantize.h" #include "layer/reshape.h" +#include "layer/rmsnorm.h" #include "layer/rnn.h" #include "layer/roialign.h" #include "layer/roipooling.h" @@ -1676,9 +1677,20 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 1=%d", input_dim) fprintf_param_value(" 2=%d", bias_term) fprintf_param_value(" 3=%d", weight_data_size) + fprintf_param_value(" 18=%d", int8_scale_term) fwrite_weight_tag_data(op->weight_data, bp); fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term) + { + ncnn::Mat weight_data_int8_scales(1); + weight_data_int8_scales[0] = op->weight_data_int8_scale; + fwrite_weight_data(weight_data_int8_scales, bp, 90, 100); + } +#endif // NCNN_INT8 } else if (layer->type == "Exp") { @@ -2007,6 +2019,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 3=%d", kdim) fprintf_param_value(" 4=%d", vdim) fprintf_param_value(" 5=%d", attn_mask) + fprintf_param_value(" 6=%e", scale) fwrite_weight_tag_data(op->q_weight_data, bp); fwrite_weight_data(op->q_bias_data, bp); @@ -2301,6 +2314,17 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 2=%d", c) fprintf_param_value(" 3=%d", permute) } + else if (layer->type == "RMSNorm") + { + ncnn::RMSNorm* op = (ncnn::RMSNorm*)layer; + ncnn::RMSNorm* op_default = (ncnn::RMSNorm*)layer_default; + + fprintf_param_value(" 0=%d", affine_size) + fprintf_param_value(" 1=%e", eps) + fprintf_param_value(" 2=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + } else if (layer->type == "RNN") { ncnn::RNN* op = (ncnn::RNN*)layer; diff --git a/tools/onnx/onnx2ncnn.cpp b/tools/onnx/onnx2ncnn.cpp index e443a28edf14..1b29e34c1285 100644 --- a/tools/onnx/onnx2ncnn.cpp +++ b/tools/onnx/onnx2ncnn.cpp @@ -2956,6 +2956,15 @@ static std::string trunc_name(std::string name) int main(int argc, char** argv) { + fprintf(stderr, "onnx2ncnn may not fully meet your needs. For more accurate and elegant\n\ +conversion results, please use PNNX. PyTorch Neural Network eXchange (PNNX) is\n\ +an open standard for PyTorch model interoperability. PNNX provides an open model\n\ +format for PyTorch. It defines computation graph as well as high level operators\n\ +strictly matches PyTorch. You can obtain pnnx through the following ways:\n\ +1. Install via python\n\ + pip3 install pnnx\n\ +2. 
Get the executable from https://github.com/pnnx/pnnx\n\ +For more information, please refer to https://github.com/pnnx/pnnx\n"); if (!(argc == 2 || argc == 4)) { fprintf(stderr, "Usage: %s [onnxpb] [ncnnparam] [ncnnbin]\n", argv[0]); diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index e2fc28da9a9c..7743a8ae453e 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -77,6 +77,7 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_ReplicationPad1d.cpp pass_level1/nn_ReplicationPad2d.cpp pass_level1/nn_ReplicationPad3d.cpp + pass_level1/nn_RMSNorm.cpp pass_level1/nn_RNN.cpp pass_level1/nn_RReLU.cpp pass_level1/nn_SELU.cpp @@ -163,6 +164,7 @@ set(pnnx_pass_level2_SRCS pass_level2/F_prelu.cpp pass_level2/F_relu.cpp pass_level2/F_relu6.cpp + pass_level2/F_rms_norm.cpp pass_level2/F_rrelu.cpp pass_level2/F_scaled_dot_product_attention.cpp pass_level2/F_selu.cpp @@ -367,6 +369,7 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_pixel_unshuffle.cpp pass_level5/fuse_layernorm.cpp pass_level5/fuse_multiheadattention.cpp + pass_level5/fuse_rmsnorm.cpp pass_level5/fuse_scaled_dot_product_attention.cpp pass_level5/fuse_select_to_unbind.cpp pass_level5/fuse_silu.cpp @@ -383,6 +386,7 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_static_layernorm.cpp pass_level5/fuse_static_linear.cpp pass_level5/fuse_static_prelu.cpp + pass_level5/fuse_static_rmsnorm.cpp pass_level5/normalize_einsum_equation.cpp pass_level5/unroll_rnn_op.cpp ) @@ -472,6 +476,8 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/F_prelu.cpp pass_ncnn/F_relu.cpp pass_ncnn/F_relu6.cpp + pass_ncnn/F_rms_norm.cpp + pass_ncnn/F_scaled_dot_product_attention.cpp pass_ncnn/F_selu.cpp pass_ncnn/F_sigmoid.cpp pass_ncnn/F_silu.cpp @@ -537,6 +543,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/nn_ReplicationPad1d.cpp pass_ncnn/nn_ReplicationPad2d.cpp pass_ncnn/nn_ReplicationPad3d.cpp + pass_ncnn/nn_RMSNorm.cpp pass_ncnn/nn_RNN.cpp pass_ncnn/nn_SELU.cpp pass_ncnn/nn_Sigmoid.cpp @@ -571,6 +578,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/torch_mm.cpp pass_ncnn/torch_norm.cpp pass_ncnn/torch_prod.cpp + pass_ncnn/torch_roll.cpp pass_ncnn/torch_slice_scatter.cpp pass_ncnn/torch_squeeze.cpp pass_ncnn/torch_sum.cpp @@ -586,12 +594,12 @@ if(PROTOBUF_FOUND) endif() if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE) - protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx.proto) + protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS}) target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES}) else() - add_library(onnxproto STATIC onnx.proto) + add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) protobuf_generate(TARGET onnxproto) target_link_libraries(onnxproto PUBLIC protobuf::libprotobuf) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 07d2bbefefd2..8b2b6dfd2d7f 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1091,7 +1091,8 @@ static std::string expand_expression(const Operator* op) || t == "maximum" || t == "min" || t == "minimum" - || t == "pow") + || t == "pow" + || t == "logaddexp") { std::string binaryop; if (t == "atan2") binaryop = "torch.atan2"; @@ -1101,6 +1102,7 @@ static std::string expand_expression(const Operator* op) if (t == "min") binaryop = "torch.min"; if (t == 
"minimum") binaryop = "torch.minimum"; if (t == "pow") binaryop = "torch.pow"; + if (t == "logaddexp") binaryop = "torch.logaddexp"; std::string a = exprstack.top(); exprstack.pop(); @@ -2109,6 +2111,15 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) fprintf(pyfp, ", "); } + if (op->type == "torch.max" || op->type == "torch.max") + { + if (op->has_param("dim") && op->outputs.size() == 1) + { + // torch.max and torch.min with dim returns tuple + fprintf(pyfp, ", _"); + } + } + if (op->type.substr(0, 7) == "Tensor.") { if (op->type == "Tensor.fill") diff --git a/tools/pnnx/src/load_onnx.cpp b/tools/pnnx/src/load_onnx.cpp index 36624d916bdd..9adf2b470888 100644 --- a/tools/pnnx/src/load_onnx.cpp +++ b/tools/pnnx/src/load_onnx.cpp @@ -14,7 +14,7 @@ #include "load_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/src/onnx-data.proto b/tools/pnnx/src/onnx-data.proto new file mode 100644 index 000000000000..d7d925d45d02 --- /dev/null +++ b/tools/pnnx/src/onnx-data.proto @@ -0,0 +1,155 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// SPDX-License-Identifier: Apache-2.0 + + +syntax = "proto2"; + +package onnx; +import "onnx-ml.proto"; + +// This file contains the proto definitions for MapProto and +// SequenceProto. These protos are used to represent the data structures +// of maps and sequence for use in test data or ModelProto. + +// Sequences +// +// Defines a dense, ordered, collection of elements that are of homogeneous types. +// Sequences can be made out of tensors, maps, or sequences. +// +// If a sequence is made out of tensors, the tensors must have the same element +// type (i.e. int32). In some cases, the tensors in a sequence can have different +// shapes. Whether the tensors can have different shapes or not depends on the +// type/shape associated with the corresponding "ValueInfo". For example, +// "Sequence" means that all tensors have same shape. However, +// "Sequence" means they can have different +// shapes (all of rank 2), where "omitted" means the corresponding dimension has +// no symbolic/constant value. Finally, "Sequence>" means +// that the different tensors can have different ranks, when the "shape" itself +// is omitted from the tensor-type. For a more complete description, refer to +// https://github.com/onnx/onnx/blob/main/docs/IR.md#static-tensor-shapes. +// +message SequenceProto { + + optional string name = 1; + + enum DataType { + UNDEFINED = 0; + TENSOR = 1; + SPARSE_TENSOR = 2; + SEQUENCE = 3; + MAP = 4; + OPTIONAL = 5; + } + + // The data type of the element. + // This field MUST have a valid SequenceProto.DataType value + optional int32 elem_type = 2; + + // For TensorProto values. + // When this field is present, the elem_type field MUST be TENSOR. + repeated TensorProto tensor_values = 3; + + // For SparseTensorProto values. + // When this field is present, the elem_type field MUST be SPARSE_TENSOR. + repeated SparseTensorProto sparse_tensor_values = 4; + + // For SequenceProto values, allowing sequences to be of themselves. + // When this field is present, the elem_type field MUST be SEQUENCE. + repeated SequenceProto sequence_values = 5; + + // For MapProto values. + // When this field is present, the elem_type field MUST be MAP. + repeated MapProto map_values = 6; + + // For OptionalProto values. + // When this field is present, the elem_type field MUST be Optional. 
+ repeated OptionalProto optional_values = 7; + +} + + +// Maps +// +// Specifies an associative table, defined by keys and values. +// MapProto is formed with a repeated field of keys (of type INT8, INT16, INT32, +// INT64, UINT8, UINT16, UINT32, UINT64, or STRING) and values (of type TENSOR, +// SPARSE_TENSOR, SEQUENCE, or MAP). Key types and value types have to remain +// the same throughout the instantiation of the MapProto. +// +message MapProto { + + optional string name = 1; + + // All MapProto data types must have the same length of keys and values. + + // The data type of the key. + // This field MUST have a valid TensorProto.DataType value of + // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING + optional int32 key_type = 2; + + // Every element of keys has to be one of the following data types + // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING. + // The integer cases are represented by the repeated int64 field keys below. + repeated int64 keys = 3; + + // If keys are strings, they are represented by the repeated bytes field + // string_keys below. + repeated bytes string_keys = 4; + + // MapProto values are represented in a SequenceProto of the same length as the + // repeated keys field and have to be one of the following data types + // TENSOR, SPARSE_TENSOR, MAP, SEQUENCE. + optional SequenceProto values = 5; +} + +// Optional +// +// +message OptionalProto { + + optional string name = 1; + + enum DataType { + UNDEFINED = 0; + TENSOR = 1; + SPARSE_TENSOR = 2; + SEQUENCE = 3; + MAP = 4; + OPTIONAL = 5; + } + + // The data type of the element, identifies if the OptionalProto value + // is Tensor, Sparse Tensor, Sequence, Map, or Optional. + // The type of the optional value MUST match the elem_type specified. + // This field MUST have a valid OptionalProto.DataType value. + optional int32 elem_type = 2; + + // For TensorProto value. + // When this field is present, the elem_type field MUST be TENSOR. + optional TensorProto tensor_value = 3; + + // For SparseTensorProto value. + // When this field is present, the elem_type field MUST be SPARSE_TENSOR. + optional SparseTensorProto sparse_tensor_value = 4; + + // For SequenceProto value. + // When this field is present, the elem_type field MUST be SEQUENCE. + optional SequenceProto sequence_value = 5; + + // For MapProto value. + // When this field is present, the elem_type field MUST be MAP. + optional MapProto map_value = 6; + + // For OptionalProto value, allowing optional to be of itself (completeness) + // When this field is present, the elem_type field MUST be OPTIONAL. + optional OptionalProto optional_value = 7; + +} + +// For using protobuf-lite +option optimize_for = LITE_RUNTIME; + diff --git a/tools/pnnx/src/onnx.proto b/tools/pnnx/src/onnx-ml.proto similarity index 92% rename from tools/pnnx/src/onnx.proto rename to tools/pnnx/src/onnx-ml.proto index 15012ce65c38..5f4c0f4a4e28 100644 --- a/tools/pnnx/src/onnx.proto +++ b/tools/pnnx/src/onnx-ml.proto @@ -24,6 +24,8 @@ package onnx; // // The normative semantic specification of the ONNX IR is found in docs/IR.md. // Definitions of the built-in neural network operators may be found in docs/Operators.md. +// Definitions of the built-in classical machine learning operators may be found in +// docs/Operators-ml.md. // Notes // @@ -106,7 +108,11 @@ enum Version { // IR VERSION 9 published on May 5, 2023 // Added AttributeProto to FunctionProto so that default attribute values can be set. 
// Added FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ. - IR_VERSION = 0x0000000000000009; + IR_VERSION_2023_5_5 = 0x0000000000000009; + + // IR VERSION 10 published on TBD + // Added UINT4, INT4. + IR_VERSION = 0x000000000000000A; } // Attributes @@ -190,6 +196,8 @@ message ValueInfoProto { optional TypeProto type = 2; // A human-readable documentation for this value. Markdown is allowed. optional string doc_string = 3; + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 4; } // Nodes @@ -211,12 +219,17 @@ message NodeProto { optional string op_type = 4; // namespace Operator // The domain of the OperatorSet that specifies the operator named by op_type. optional string domain = 7; // namespace Domain + // Overload identifier, used only to map this to a model-local function. + optional string overload = 8; // Additional named attributes. repeated AttributeProto attribute = 5; // A human-readable documentation for this node. Markdown is allowed. optional string doc_string = 6; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 9; } // Training information @@ -401,7 +414,7 @@ message ModelProto { // A list of function protos local to the model. // - // Name of the function "FunctionProto.name" should be unique within the domain "FunctionProto.domain". + // The (domain, name, overload) tuple must be unique across the function protos in this list. // In case of any conflicts the behavior (whether the model local functions are given higher priority, // or standard operator sets are given higher priotity or this is treated as error) is defined by // the runtimes. @@ -475,6 +488,9 @@ message GraphProto { // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. repeated TensorAnnotation quantization_annotation = 14; + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; + reserved 3, 4, 6 to 9; reserved "ir_version", "producer_version", "producer_tag", "domain"; } @@ -520,7 +536,11 @@ message TensorProto { FLOAT8E4M3FN = 17; // float 8, mostly used for coefficients, supports nan, not inf FLOAT8E4M3FNUZ = 18; // float 8, mostly used for coefficients, supports nan, not inf, no negative zero FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients - FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, inf, mostly used for gradients, no negative zero + FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, not inf, mostly used for gradients, no negative zero + + // 4-bit data-types + UINT4 = 21; // Unsigned integer in range [0, 15] + INT4 = 22; // Signed integer in range [-8, 7], using two's-complement representation // Future extensions go here. } @@ -555,11 +575,13 @@ message TensorProto { // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. repeated float float_data = 4 [packed = true]; - // For int32, uint8, int8, uint16, int16, bool, float8, and float16 values + // For int32, uint8, int8, uint16, int16, uint4, int4, bool, float8 and float16 values // float16 and float8 values must be bit-wise converted to an uint16_t prior // to writing to the buffer. + // uint4 and int4 values must be packed to 4bitx2 prior to writing to the buffer, the first element is stored in + // the 4 LSB and the second element is stored in the 4 MSB. 
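+ // For example, under this packing the int4 pair (3, -2) occupies a single byte 0xE3:
+ // 3 (0x3) in the low nibble and -2 (0xE in 4-bit two's complement) in the high nibble.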
// When this field is present, the data_type field MUST be - // INT32, INT16, INT8, UINT16, UINT8, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ + // INT32, INT16, INT8, INT4, UINT16, UINT8, UINT4, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ repeated int32 int32_data = 5 [packed = true]; // For strings. @@ -589,6 +611,7 @@ message TensorProto { // Complex64 elements must be written as two consecutive FLOAT values, real component first. // Complex128 elements must be written as two consecutive DOUBLE values, real component first. // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // uint4 and int4 values must be packed to 4bitx2, the first element is stored in the 4 LSB and the second element is stored in the 4 MSB. // // Note: the advantage of specific field rather than the raw_data field is // that in some cases (e.g. int data), protobuf does a better packing via @@ -631,6 +654,9 @@ message TensorProto { // When this field is present, the data_type field MUST be // UINT32 or UINT64 repeated uint64 uint64_data = 11 [packed = true]; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; } // A serialized sparse-tensor value @@ -724,6 +750,17 @@ message TypeProto { } + message Opaque { + // When missing, the domain is the same as the model's. + optional string domain = 1; + // The name is optional but significant when provided. + optional string name = 2; + // parameters that help defining the type + // DEPRECATED do not use. + // repeated TypeProto parameters = 3; + } + + oneof value { // The type of a tensor. Tensor tensor_type = 1; @@ -746,6 +783,9 @@ message TypeProto { // Type of the sparse tensor SparseTensor sparse_tensor_type = 8; + + Opaque opaque_type = 7; + } // An optional denotation can be used to denote the whole @@ -777,9 +817,8 @@ enum OperatorStatus { } message FunctionProto { - // The name of the function, similar usage of op_type in OperatorProto. - // Combined with FunctionProto.domain, this forms the unique identity of - // the FunctionProto. + // The name of the function, similar to op_type in NodeProto. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. optional string name = 1; // Deprecated since IR Version 8 @@ -826,9 +865,22 @@ message FunctionProto { repeated OperatorSetIdProto opset_import = 9; - // The domain which this function belongs to. Combined with FunctionProto.name, this forms the unique identity of - // the FunctionProto. + // The domain which this function belongs to. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. optional string domain = 10; + + // The overload identifier of the function. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. + optional string overload = 13; + + // Information for the values in the function. The ValueInfoProto.name's + // must be distinct and refer to names in the function (including inputs, + // outputs, and intermediate values). It is optional for a value to appear + // in value_info list. + repeated ValueInfoProto value_info = 12; + + // Named metadata values; keys should be distinct. 
+ repeated StringStringEntryProto metadata_props = 14; } // For using protobuf-lite diff --git a/tools/pnnx/src/onnx-operators-ml.proto b/tools/pnnx/src/onnx-operators-ml.proto new file mode 100644 index 000000000000..de62706f5cbd --- /dev/null +++ b/tools/pnnx/src/onnx-operators-ml.proto @@ -0,0 +1,136 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// Copyright (c) ONNX Project Contributors. +// Licensed under the Apache-2.0 license. + +syntax = "proto2"; + +package onnx; +import "onnx-ml.proto"; + +// +// This file contains the proto definitions for OperatorSetProto and +// OperatorProto. OperatorSetProtos are used to describe a versioned +// set of operators that can be used by a ModelProto. +// +// Like ModelProto, OperatorSetProto is defined as a top-level file/wire +// format, however their usage is different. +// +// ModelProto files are used to describe executable graphs that can be +// executed directly by a framework, runtime, or engine. +// +// OperatorSetProto files are used to describe a set of operators that are +// available in a given environment. The file TBD.TBD is the OperatorSetProto +// that describes the ONNX standard operators. +// + +// An OperatorProto represents the immutable specification of the signature +// and semantics of an operator. +// +// Operators are declared as part of an OperatorSet, which also defines the +// domain name for the set. +// +// Operators are uniquely identified by a three part identifier +// (domain, op_type, since_version) +// where +// *domain* is the domain of an operator set that +// contains this operator specification. +// +// *op_type* is the name of the operator as referenced by a +// NodeProto.op_type +// +// *since_version* is the version of the operator set that +// this operator was initially declared in. +// +message OperatorProto { + // The name of the operator within a domain. + // This field MUST be present in this version of the IR. + optional string op_type = 1; + + // The version of the operator set that first introduced this + // operator. This value MUST be the same value as the + // opset_version of the operator set that first published this operator. + // Subsequent versions of the operator set MUST NOT alter the signature + // or semantics of the operator once published as STABLE. + // This field MUST be present in this version of the IR. + optional int64 since_version = 2; + + // This field indicates whether the syntax, semantics, or presence + // of this operator is in an experimental or stable stage. Once an + // operator is published as STABLE, it's syntax and semantics MUST NOT + // change in subsequent versions of the operator set. + // When an operator is published as EXPERIMENTAL, the syntax and semantics + // of the operator MAY change across operator set versions. + // Operators "become" stable by deprecating the experimental version and + // introducing a new stable operator with the same op_type. + optional OperatorStatus status = 3; + + // Eventually we will declare the signature of the operator here + + // A human-readable documentation for this operator. Markdown is allowed. + optional string doc_string = 10; +} + +// An OperatorSetProto represents an immutable set of immutable operator +// specifications. +// +// The domain of the set (OperatorSetProto.domain) is a reverse-DNS name +// that disambiguates operator sets defined by independent entities. 
+// +// The version of the set (opset_version) is a monotonically increasing +// integer that indicates changes to the membership of the operator set. +// +// +// Operator sets are uniquely identified by a two part identifier (domain, opset_version) +// +// Like ModelProto, OperatorSetProto is intended as a top-level file/wire format, +// and thus has the standard format headers in addition to the operator set information. +// +message OperatorSetProto { + // All OperatorSetProtos start with a distingushed byte sequence to disambiguate + // protobuf files containing OperatorSets from other content. + // This field MUST be "ONNXOPSET" + // This field MUST be present in this version of the IR + optional string magic = 1; + + // All OperatorSetProtos indicate the version of the IR syntax and semantics + // they adhere to. It is always IR_VERSION. + // This field MUST be present in this version of the IR + optional int64 ir_version = 2; + + // The prerelease component of the SemVer of the IR. + // This field MAY be absent in this version of the IR + optional string ir_version_prerelease = 3; + + // The build metadata component of the SemVer of the IR. + // This field MAY be absent in this version of the IR + optional string ir_build_metadata = 7; + + // Domain name of the operator set, in reverse DNS form (e.g., com.acme.dnnops). + optional string domain = 4; + + // The version of the set of operators. This is a simple int value + // that is monotonically increasing as new versions of the operator set + // are published. All operators in this set MUST have since_version + // <= opset_version. + optional int64 opset_version = 5; + + // A human-readable documentation for this set of operators. Markdown is allowed. + optional string doc_string = 6; + + // The operators specified by this operator set. + // The (name, version) MUST be unique across all OperatorProtos in operator + repeated OperatorProto operator = 8; + + // The functions specified by this operator set. + // The (name, version) MUST be unique across all OperatorProtos/FunctionProtos in operator/functions + repeated FunctionProto functions = 9; +} + + +// For using protobuf-lite +option optimize_for = LITE_RUNTIME; + diff --git a/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp new file mode 100644 index 000000000000..498f0453c14f --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp @@ -0,0 +1,51 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
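+
+// This pass matches the TorchScript module __torch__.torch.nn.modules.normalization.RMSNorm
+// and rewrites it to the pnnx nn.RMSNorm operator, reading normalized_shape and eps from the
+// traced aten::rms_norm node and keeping the weight attribute when elementwise_affine is set.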
+ +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class RMSNorm : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.normalization.RMSNorm"; + } + + const char* type_str() const + { + return "nn.RMSNorm"; + } + + void write(Operator* op, const std::shared_ptr& graph, const torch::jit::Module& mod) const + { + const torch::jit::Node* rmsn = find_node_by_kind(graph, "aten::rms_norm"); + + op->params["normalized_shape"] = rmsn->namedInput("normalized_shape"); + op->params["eps"] = rmsn->namedInput("eps"); + op->params["elementwise_affine"] = mod.hasattr("weight"); + + if (mod.hasattr("weight")) + { + op->attrs["weight"] = mod.attr("weight").toTensor(); + } + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(RMSNorm) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2.cpp b/tools/pnnx/src/pass_level2.cpp index bc7e51b8d5d0..de44a3553662 100644 --- a/tools/pnnx/src/pass_level2.cpp +++ b/tools/pnnx/src/pass_level2.cpp @@ -1166,6 +1166,18 @@ static void functionize(Graph& graph) if (out0->consumers.size() == 1) continue; + bool all_consumers_are_same = true; + for (size_t j = 1; j < out0->consumers.size(); j++) + { + if (out0->consumers[j] != out0->consumers[0]) + { + all_consumers_are_same = false; + break; + } + } + if (all_consumers_are_same) + continue; + for (int j = (int)out0->consumers.size() - 1; j > 0; j--) { Operator* op1 = out0->consumers[j]; diff --git a/tools/pnnx/src/pass_level2/F_hardswish.cpp b/tools/pnnx/src/pass_level2/F_hardswish.cpp index caa724f55a73..2ce9e1b420bf 100644 --- a/tools/pnnx/src/pass_level2/F_hardswish.cpp +++ b/tools/pnnx/src/pass_level2/F_hardswish.cpp @@ -343,4 +343,30 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_hardswish_onnx_2, 9) +class F_hardswish_onnx_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input 0 1 input +prim::Constant op_0 0 1 v3 value=3 +aten::add op_1 2 1 input v3 a +aten::clamp op_2 1 1 a b max=6 min=0 +aten::mul op_3 2 1 input b c +prim::Constant op_4 0 1 v6 value=6 +aten::div op_5 2 1 c v6 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.hardswish"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_hardswish_onnx_3, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_interpolate.cpp b/tools/pnnx/src/pass_level2/F_interpolate.cpp index b93bd2df6c8d..119842b1c780 100644 --- a/tools/pnnx/src/pass_level2/F_interpolate.cpp +++ b/tools/pnnx/src/pass_level2/F_interpolate.cpp @@ -1005,7 +1005,7 @@ class F_interpolate_onnx : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Resize op_0 1 1 input out sizes=%sizes coordinate_transformation_mode=%coordinate_transformation_mode mode=%mode nearest_mode=floor cubic_coeff_a=* +Resize op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -1017,104 +1017,69 @@ pnnx.Output output 1 0 out bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const { - if (captured_params.at("sizes").type != 5) + if (captured_params.find("op_0.coordinate_transformation_mode") == captured_params.end()) return false; - const std::vector& sizes = captured_params.at("sizes").ai; - - if (sizes.size() < 3 || sizes.size() > 5) + if (captured_params.at("op_0.coordinate_transformation_mode").type != 4) return false; - const std::vector& input_shape = 
matched_operators.at("op_0")->inputs[0]->shape; - if (input_shape.size() < 3 || input_shape.size() > 5) + if (captured_params.find("op_0.mode") == captured_params.end()) return false; - if (input_shape[0] != sizes[0] || input_shape[1] != sizes[1]) + if (captured_params.at("op_0.mode").type != 4) return false; - return true; - } - - void write(Operator* op, const std::map& captured_params) const - { - const std::string& coordinate_transformation_mode = captured_params.at("coordinate_transformation_mode").s; - std::string mode = captured_params.at("mode").s; - const std::vector& sizes = captured_params.at("sizes").ai; - - if (mode == "linear") + if (captured_params.find("op_0.nearest_mode") != captured_params.end()) { - if (coordinate_transformation_mode == "half_pixel") - op->params["align_corners"] = false; - if (coordinate_transformation_mode == "align_corners") - op->params["align_corners"] = true; - - if (sizes.size() == 4) - mode = "bilinear"; - if (sizes.size() == 5) - mode = "trilinear"; + if (captured_params.at("op_0.nearest_mode").type != 4 || captured_params.at("op_0.nearest_mode").s != "floor") + return false; } - if (mode == "cubic") + if (captured_params.find("op_0.roi") != captured_params.end()) { - if (coordinate_transformation_mode == "half_pixel") - op->params["align_corners"] = false; - if (coordinate_transformation_mode == "align_corners") - op->params["align_corners"] = true; - - mode = "bicubic"; + if (captured_params.at("op_0.roi").type != 6 || !captured_params.at("op_0.roi").ai.empty()) + return false; } - op->params["mode"] = mode; - if (sizes.size() == 3) - op->params["size"] = {sizes[2]}; - if (sizes.size() == 4) - op->params["size"] = {sizes[2], sizes[3]}; - if (sizes.size() == 5) - op->params["size"] = {sizes[2], sizes[3], sizes[4]}; - } -}; + if (captured_params.find("op_0.sizes") == captured_params.end() && captured_params.find("op_0.scales") == captured_params.end()) + return false; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx, 10) + if (captured_params.find("op_0.sizes") != captured_params.end() && captured_params.at("op_0.sizes").type == 5 && !captured_params.at("op_0.sizes").ai.empty()) + { + const std::vector& sizes = captured_params.at("op_0.sizes").ai; -class F_interpolate_onnx_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -3 2 -pnnx.Input input 0 1 input -Resize op_0 1 1 input out scales=%scales coordinate_transformation_mode=%coordinate_transformation_mode mode=%mode nearest_mode=floor cubic_coeff_a=* -pnnx.Output output 1 0 out -)PNNXIR"; - } + if (sizes.size() < 3 || sizes.size() > 5) + return false; - const char* type_str() const - { - return "F.interpolate"; - } + const std::vector& input_shape = matched_operators.at("op_0")->inputs[0]->shape; + if (input_shape.size() < 3 || input_shape.size() > 5) + return false; - bool match(const std::map& captured_params) const - { - if (captured_params.at("scales").type != 6) - return false; - - const std::vector& scales = captured_params.at("scales").af; + if (input_shape[0] != sizes[0] || input_shape[1] != sizes[1]) + return false; + } + else if (captured_params.find("op_0.scales") != captured_params.end() && captured_params.at("op_0.scales").type == 6 && !captured_params.at("op_0.scales").af.empty()) + { + const std::vector& scales = captured_params.at("op_0.scales").af; - if (scales.size() < 3 || scales.size() > 5) - return false; + if (scales.size() < 3 || scales.size() > 5) + return false; - if (scales[0] != 1.f || 
scales[1] != 1.f) + if (scales[0] != 1.f || scales[1] != 1.f) + return false; + } + else + { return false; + } return true; } void write(Operator* op, const std::map& captured_params) const { - const std::string& coordinate_transformation_mode = captured_params.at("coordinate_transformation_mode").s; - std::string mode = captured_params.at("mode").s; - const std::vector& scales = captured_params.at("scales").af; + const std::string& coordinate_transformation_mode = captured_params.at("op_0.coordinate_transformation_mode").s; + std::string mode = captured_params.at("op_0.mode").s; if (mode == "linear") { @@ -1122,11 +1087,6 @@ pnnx.Output output 1 0 out op->params["align_corners"] = false; if (coordinate_transformation_mode == "align_corners") op->params["align_corners"] = true; - - if (scales.size() == 4) - mode = "bilinear"; - if (scales.size() == 5) - mode = "trilinear"; } if (mode == "cubic") @@ -1135,22 +1095,63 @@ pnnx.Output output 1 0 out op->params["align_corners"] = false; if (coordinate_transformation_mode == "align_corners") op->params["align_corners"] = true; - - mode = "bicubic"; } - op->params["mode"] = mode; - op->params["recompute_scale_factor"] = false; - if (scales.size() == 3) - op->params["scale_factor"] = {scales[2]}; - if (scales.size() == 4) - op->params["scale_factor"] = {scales[2], scales[3]}; - if (scales.size() == 5) - op->params["scale_factor"] = {scales[2], scales[3], scales[4]}; + if (captured_params.find("op_0.sizes") != captured_params.end() && captured_params.at("op_0.sizes").type == 5 && !captured_params.at("op_0.sizes").ai.empty()) + { + const std::vector& sizes = captured_params.at("op_0.sizes").ai; + + if (mode == "linear") + { + if (sizes.size() == 4) + mode = "bilinear"; + if (sizes.size() == 5) + mode = "trilinear"; + } + + if (mode == "cubic") + { + mode = "bicubic"; + } + + op->params["mode"] = mode; + if (sizes.size() == 3) + op->params["size"] = {sizes[2]}; + if (sizes.size() == 4) + op->params["size"] = {sizes[2], sizes[3]}; + if (sizes.size() == 5) + op->params["size"] = {sizes[2], sizes[3], sizes[4]}; + } + else if (captured_params.find("op_0.scales") != captured_params.end() && captured_params.at("op_0.scales").type == 6 && !captured_params.at("op_0.scales").af.empty()) + { + const std::vector& scales = captured_params.at("op_0.scales").af; + + if (mode == "linear") + { + if (scales.size() == 4) + mode = "bilinear"; + if (scales.size() == 5) + mode = "trilinear"; + } + + if (mode == "cubic") + { + mode = "bicubic"; + } + + op->params["mode"] = mode; + op->params["recompute_scale_factor"] = false; + if (scales.size() == 3) + op->params["scale_factor"] = {scales[2]}; + if (scales.size() == 4) + op->params["scale_factor"] = {scales[2], scales[3]}; + if (scales.size() == 5) + op->params["scale_factor"] = {scales[2], scales[3], scales[4]}; + } } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx_1, 10) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx, 10) class F_interpolate_onnx_2 : public GraphRewriterPass { diff --git a/tools/pnnx/src/pass_level2/F_linear.cpp b/tools/pnnx/src/pass_level2/F_linear.cpp index 4c454581ec3f..62f9d62e5054 100644 --- a/tools/pnnx/src/pass_level2/F_linear.cpp +++ b/tools/pnnx/src/pass_level2/F_linear.cpp @@ -129,7 +129,7 @@ class F_linear_onnx : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 weight pnnx.Input input_2 0 1 bias -Gemm op_0 3 1 input weight bias out alpha=1.000000e+00 beta=1.000000e+00 transB=1 +Gemm gemm 3 1 input weight bias out %*=%* pnnx.Output 
output 1 0 out )PNNXIR"; } @@ -138,6 +138,39 @@ pnnx.Output output 1 0 out { return "F.linear"; } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + if (captured_params.find("gemm.alpha") != captured_params.end()) + { + if (captured_params.at("gemm.alpha").type != 3 || captured_params.at("gemm.alpha").f != 1.f) + return false; + } + + if (captured_params.find("gemm.beta") != captured_params.end()) + { + if (captured_params.at("gemm.beta").type != 3 || captured_params.at("gemm.beta").f != 1.f) + return false; + } + + if (captured_params.find("gemm.transA") != captured_params.end()) + { + if (captured_params.at("gemm.transA").type != 2 || captured_params.at("gemm.transA").i != 0) + return false; + } + + if (captured_params.find("gemm.transB") == captured_params.end()) + return false; + + if (captured_params.at("gemm.transB").type != 2 || captured_params.at("gemm.transB").i != 1) + return false; + + return true; + } + + void write(Operator* op, const std::map& /*captured_params*/) const + { + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_linear_onnx, 10) @@ -152,7 +185,7 @@ class F_linear_onnx_1 : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 bias pnnx.Attribute weight 0 1 weight @data=(%in_features,%out_features)f32 -Gemm gemm 3 1 input weight bias out alpha=1.000000e+00 beta=1.000000e+00 +Gemm gemm 3 1 input weight bias out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -169,6 +202,35 @@ pnnx.Output output 1 0 out )PNNXIR"; } + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + if (captured_params.find("gemm.alpha") != captured_params.end()) + { + if (captured_params.at("gemm.alpha").type != 3 || captured_params.at("gemm.alpha").f != 1.f) + return false; + } + + if (captured_params.find("gemm.beta") != captured_params.end()) + { + if (captured_params.at("gemm.beta").type != 3 || captured_params.at("gemm.beta").f != 1.f) + return false; + } + + if (captured_params.find("gemm.transA") != captured_params.end()) + { + if (captured_params.at("gemm.transA").type != 2 || captured_params.at("gemm.transA").i != 0) + return false; + } + + if (captured_params.find("gemm.transB") != captured_params.end()) + { + if (captured_params.at("gemm.transB").type != 2 || captured_params.at("gemm.transB").i != 0) + return false; + } + + return true; + } + void write(const std::map& ops, const std::map& captured_params, const std::map& captured_attrs) const { const int in_features = captured_params.at("in_features").i; diff --git a/tools/pnnx/src/pass_level2/F_log_softmax.cpp b/tools/pnnx/src/pass_level2/F_log_softmax.cpp index 0264973783b0..ad9eba30d1cf 100644 --- a/tools/pnnx/src/pass_level2/F_log_softmax.cpp +++ b/tools/pnnx/src/pass_level2/F_log_softmax.cpp @@ -39,4 +39,77 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax, 10) +class F_log_softmax_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input_0 0 1 input +LogSoftmax op_0 1 1 input out axis=%dim +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.log_softmax"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax_onnx, 10) + +class F_log_softmax_onnx_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 input 
+Transpose op_0 1 1 input a perm=%perm +LogSoftmax op_1 1 1 a b axis=%axis +Transpose op_2 1 1 b out perm=%perm +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.log_softmax"; + } + + bool match(const std::map& captured_params) const + { + const std::vector& perm = captured_params.at("perm").ai; + const int axis = captured_params.at("axis").i; + + if (axis >= (int)perm.size()) + return false; + + int excount = 0; + for (int i = 0; i < (int)perm.size(); i++) + { + if (perm[i] != i) + excount++; + } + + if (excount != 2) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::vector& perm = captured_params.at("perm").ai; + const int axis = captured_params.at("axis").i; + + op->params["dim"] = perm[axis]; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax_onnx_1, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_logsigmoid.cpp b/tools/pnnx/src/pass_level2/F_logsigmoid.cpp index e35670686a0e..e0d4df607f23 100644 --- a/tools/pnnx/src/pass_level2/F_logsigmoid.cpp +++ b/tools/pnnx/src/pass_level2/F_logsigmoid.cpp @@ -37,4 +37,26 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_logsigmoid, 10) +class F_logsigmoid_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +aten::sigmoid op_0 1 1 input a +aten::log op_1 1 1 a out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.logsigmoid"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_logsigmoid_onnx, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_mish.cpp b/tools/pnnx/src/pass_level2/F_mish.cpp index 1a083ba85d9a..485a7e3b0b52 100644 --- a/tools/pnnx/src/pass_level2/F_mish.cpp +++ b/tools/pnnx/src/pass_level2/F_mish.cpp @@ -62,4 +62,27 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_mish_1, 9) +class F_mish_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +Softplus op_0 1 1 input a +aten::tanh op_1 1 1 a b +aten::mul op_2 2 1 input b out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.mish"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_mish_onnx, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_rms_norm.cpp b/tools/pnnx/src/pass_level2/F_rms_norm.cpp new file mode 100644 index 000000000000..aaa1813c5639 --- /dev/null +++ b/tools/pnnx/src/pass_level2/F_rms_norm.cpp @@ -0,0 +1,43 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
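+
+// Level-2 rewrite: matches the traced aten::rms_norm call together with its constant
+// eps operand and converts it into the functional F.rms_norm operator.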
+ +#include "pass_level2.h" + +namespace pnnx { + +class F_rms_norm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 weight +pnnx.Input input_2 0 1 normalized_shape +prim::Constant op_0 0 1 eps value=%eps +aten::rms_norm op_1 4 1 input normalized_shape weight eps out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.rms_norm"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_rms_norm, 10) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_selu.cpp b/tools/pnnx/src/pass_level2/F_selu.cpp index 592c3dd8ed77..9df970b1bbc1 100644 --- a/tools/pnnx/src/pass_level2/F_selu.cpp +++ b/tools/pnnx/src/pass_level2/F_selu.cpp @@ -37,4 +37,25 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_selu, 10) +class F_selu_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Selu op_0 1 1 input out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.selu"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_selu_onnx, 10) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softmin.cpp b/tools/pnnx/src/pass_level2/F_softmin.cpp index bb0768663c53..89e5d9aeaf83 100644 --- a/tools/pnnx/src/pass_level2/F_softmin.cpp +++ b/tools/pnnx/src/pass_level2/F_softmin.cpp @@ -40,4 +40,26 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softmin, 9) +class F_softmin_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +aten::neg op_0 1 1 input 6 +Softmax op_1 1 1 6 out axis=%dim +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softmin"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softmin_onnx, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softplus.cpp b/tools/pnnx/src/pass_level2/F_softplus.cpp index c6a5279b4140..8d346eb76ed5 100644 --- a/tools/pnnx/src/pass_level2/F_softplus.cpp +++ b/tools/pnnx/src/pass_level2/F_softplus.cpp @@ -39,4 +39,62 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus, 10) +class F_softplus_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input_0 0 1 input +Softplus op_0 1 1 input out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softplus"; + } + + void write(Operator* op, const std::map& /*captured_params*/) const + { + op->params["beta"] = 1.f; + op->params["threshold"] = 20.f; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus_onnx, 10) + +class F_softplus_onnx_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input_0 0 1 input +prim::Constant op_0 0 1 beta value=%beta +aten::mul op_1 2 1 input beta a +Softplus op_2 1 1 a b +prim::Constant op_3 0 1 beta2 value=%beta +aten::div op_4 2 1 b beta2 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softplus"; + } + + void write(Operator* op, const std::map& captured_params) const + { + op->params["beta"] = captured_params.at("beta"); + op->params["threshold"] = 20.f; + } +}; + 
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus_onnx_1, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softshrink.cpp b/tools/pnnx/src/pass_level2/F_softshrink.cpp index 286990bf2c57..8d14a8a644b4 100644 --- a/tools/pnnx/src/pass_level2/F_softshrink.cpp +++ b/tools/pnnx/src/pass_level2/F_softshrink.cpp @@ -38,4 +38,62 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softshrink, 10) +static bool NearlyEqual(float a, float b, float epsilon) +{ + if (a == b) + return true; + + float diff = (float)fabs(a - b); + if (diff <= epsilon) + return true; + + // relative error + return diff < epsilon * std::max(fabs(a), fabs(b)); +} + +class F_softshrink_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +15 14 +pnnx.Input input 0 1 input +prim::Constant op_0 0 1 lambd value=%lambd +aten::gt op_1 2 1 input lambd 8 +prim::Constant op_2 0 1 lambd2 value=%lambd +aten::sub op_3 2 1 input lambd2 9 +prim::Constant op_4 0 1 zero value=0 +aten::where op_5 3 1 8 9 zero a +prim::Constant op_6 0 1 mlambd value=%lambd2 +aten::lt op_7 2 1 input mlambd 11 +prim::Constant op_8 0 1 lambd3 value=%lambd +aten::add op_9 2 1 input lambd3 12 +prim::Constant op_10 0 1 zero2 value=0 +aten::where op_11 3 1 11 12 zero2 b +aten::add op_12 2 1 a b out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softshrink"; + } + + bool match(const std::map& captured_params) const + { + float lambd = captured_params.at("lambd").f; + float lambd2 = captured_params.at("lambd2").f; + return NearlyEqual(lambd, -lambd2, 0.001); + } + + void write(Operator* op, const std::map& captured_params) const + { + op->params["lambd"] = captured_params.at("lambd"); + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softshrink_onnx, 10) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softsign.cpp b/tools/pnnx/src/pass_level2/F_softsign.cpp index 4ec8ae9e520d..ae6005d63376 100644 --- a/tools/pnnx/src/pass_level2/F_softsign.cpp +++ b/tools/pnnx/src/pass_level2/F_softsign.cpp @@ -41,4 +41,28 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softsign, 10) +class F_softsign_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +aten::abs op_0 1 1 input 6 +prim::Constant op_1 0 1 8 value=1 +aten::add op_2 2 1 6 8 9 +aten::div op_3 2 1 input 9 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.softsign"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softsign_onnx, 10) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_tanhshrink.cpp b/tools/pnnx/src/pass_level2/F_tanhshrink.cpp index d8d6c311fcd8..01e578bf8ade 100644 --- a/tools/pnnx/src/pass_level2/F_tanhshrink.cpp +++ b/tools/pnnx/src/pass_level2/F_tanhshrink.cpp @@ -39,4 +39,26 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_tanhshrink, 9) +class F_tanhshrink_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +aten::tanh op_0 1 1 input 7 +aten::sub op_1 2 1 input 7 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.tanhshrink"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_tanhshrink_onnx, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/Tensor_expand.cpp 
b/tools/pnnx/src/pass_level2/Tensor_expand.cpp index 23c1af6a863d..4c94d7b8e04f 100644 --- a/tools/pnnx/src/pass_level2/Tensor_expand.cpp +++ b/tools/pnnx/src/pass_level2/Tensor_expand.cpp @@ -61,4 +61,52 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_expand_1, 20) +class Tensor_expand_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Expand op_0 1 1 input out %*=%* +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Tensor.expand"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.shape") == captured_params.end()) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + if (captured_params.at("op_0.shape").type == 5) + { + op->params["shape"] = captured_params.at("op_0.shape"); + } + else // if (captured_params.at("op_0.shape").type == 2) + { + op->params["shape"] = std::vector{captured_params.at("op_0.shape").i}; + } + + // onnx set expand shape 1 for not changing the size of that dimension while torch uses -1 + for (size_t i = 0; i < op->params["shape"].ai.size(); i++) + { + if (op->params["shape"].ai[i] == 1) + op->params["shape"].ai[i] = -1; + } + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_expand_onnx, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/Tensor_reshape.cpp b/tools/pnnx/src/pass_level2/Tensor_reshape.cpp index 1c578a8d6333..412e609cc403 100644 --- a/tools/pnnx/src/pass_level2/Tensor_reshape.cpp +++ b/tools/pnnx/src/pass_level2/Tensor_reshape.cpp @@ -48,7 +48,7 @@ class Tensor_reshape_onnx : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 shape aten::cat op_0 1 1 shape cat dim=0 -Reshape op_1 2 1 input cat out allowzero=* +Reshape op_1 2 1 input cat out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -57,46 +57,15 @@ pnnx.Output output 1 0 out { return "Tensor.reshape"; } -}; - -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx, 19) - -class Tensor_reshape_onnx_1 : public Tensor_reshape_onnx -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 shape -aten::cat op_0 1 1 shape cat dim=0 -Reshape op_1 2 1 input cat out -pnnx.Output output 1 0 out -)PNNXIR"; - } -}; - -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_1, 19) -class Tensor_reshape_onnx_2 : public Tensor_reshape_onnx -{ -public: - const char* match_pattern_graph() const + void write(Operator* /*op*/, const std::map& /*captured_params*/) const { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 shape -Reshape op_1 2 1 input shape out allowzero=* -pnnx.Output output 1 0 out -)PNNXIR"; } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_2, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx, 19) -class Tensor_reshape_onnx_3 : public Tensor_reshape_onnx +class Tensor_reshape_onnx_1 : public Tensor_reshape_onnx { public: const char* match_pattern_graph() const @@ -105,15 +74,15 @@ class Tensor_reshape_onnx_3 : public Tensor_reshape_onnx 4 3 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 shape -Reshape op_1 2 1 input shape out +Reshape op_0 2 1 input shape out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_3, 20) 
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_1, 20) -class Tensor_reshape_onnx_4 : public GraphRewriterPass +class Tensor_reshape_onnx_2 : public GraphRewriterPass { public: const char* match_pattern_graph() const @@ -121,7 +90,7 @@ class Tensor_reshape_onnx_4 : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Reshape op_1 1 1 input out shape=%shape allowzero=* +Reshape op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -130,24 +99,28 @@ pnnx.Output output 1 0 out { return "Tensor.reshape"; } -}; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_4, 20) + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.shape") == captured_params.end()) + return false; + + return true; + } -class Tensor_reshape_onnx_5 : public Tensor_reshape_onnx_4 -{ -public: - const char* match_pattern_graph() const + void write(Operator* op, const std::map& captured_params) const { - return R"PNNXIR(7767517 -3 2 -pnnx.Input input 0 1 input -Reshape op_1 1 1 input out shape=%shape -pnnx.Output output 1 0 out -)PNNXIR"; + if (captured_params.at("op_0.shape").type == 5) + { + op->params["shape"] = captured_params.at("op_0.shape"); + } + else // if (captured_params.at("op_0.shape").type == 2) + { + op->params["shape"] = std::vector{captured_params.at("op_0.shape").i}; + } } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_5, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_2, 20) } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_max.cpp b/tools/pnnx/src/pass_level2/torch_max.cpp index b606fed066b8..5a993d6f55ea 100644 --- a/tools/pnnx/src/pass_level2/torch_max.cpp +++ b/tools/pnnx/src/pass_level2/torch_max.cpp @@ -35,6 +35,18 @@ pnnx.Output output 2 0 out indices { return "torch.max"; } + + void write(Operator* op, const std::map& captured_params) const + { + GraphRewriterPass::write(op, captured_params); + + // drop indices if not used + if (op->outputs[1]->consumers.empty()) + { + op->outputs[1]->producer = 0; + op->outputs.resize(1); + } + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_max, 20) @@ -78,11 +90,22 @@ pnnx.Output output 1 0 out return "torch.max"; } + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.axes") != captured_params.end()) + { + if (captured_params.at("op_0.axes").type != 5 || captured_params.at("op_0.axes").ai.size() != 1) + return false; + } + + return true; + } + void write(Operator* op, const std::map& captured_params) const { if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("op_0.axes"); + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; if (captured_params.find("op_0.keepdims") != captured_params.end()) { diff --git a/tools/pnnx/src/pass_level2/torch_min.cpp b/tools/pnnx/src/pass_level2/torch_min.cpp index 35cc4988a195..fa174614e018 100644 --- a/tools/pnnx/src/pass_level2/torch_min.cpp +++ b/tools/pnnx/src/pass_level2/torch_min.cpp @@ -35,6 +35,18 @@ pnnx.Output output 2 0 out indices { return "torch.min"; } + + void write(Operator* op, const std::map& captured_params) const + { + GraphRewriterPass::write(op, captured_params); + + // drop indices if not used + if (op->outputs[1]->consumers.empty()) + { + op->outputs[1]->producer = 0; + op->outputs.resize(1); + } + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_min, 20) @@ -78,11 +90,22 @@ pnnx.Output output 1 0 out return "torch.min"; } + bool match(const std::map& 
captured_params) const + { + if (captured_params.find("op_0.axes") != captured_params.end()) + { + if (captured_params.at("op_0.axes").type != 5 || captured_params.at("op_0.axes").ai.size() != 1) + return false; + } + + return true; + } + void write(Operator* op, const std::map& captured_params) const { if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("op_0.axes"); + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; if (captured_params.find("op_0.keepdims") != captured_params.end()) { diff --git a/tools/pnnx/src/pass_level2/torch_squeeze.cpp b/tools/pnnx/src/pass_level2/torch_squeeze.cpp index d7e157d94b12..dabffebc1262 100644 --- a/tools/pnnx/src/pass_level2/torch_squeeze.cpp +++ b/tools/pnnx/src/pass_level2/torch_squeeze.cpp @@ -110,20 +110,23 @@ class torch_squeeze_onnx_1 : public torch_squeeze_onnx return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Squeeze op_0 1 1 input out axes=%axes +Squeeze op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } void write(Operator* op, const std::map& captured_params) const { - if (captured_params.at("axes").type == 5 && captured_params.at("axes").ai.size() == 1) + if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("axes").ai[0]; - } - else - { - op->params["dim"] = captured_params.at("axes"); + if (captured_params.at("op_0.axes").type == 5 && captured_params.at("op_0.axes").ai.size() == 1) + { + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; + } + else + { + op->params["dim"] = captured_params.at("op_0.axes"); + } } } }; diff --git a/tools/pnnx/src/pass_level2/torch_tile.cpp b/tools/pnnx/src/pass_level2/torch_tile.cpp index d1504bacda84..a2f2780116c1 100644 --- a/tools/pnnx/src/pass_level2/torch_tile.cpp +++ b/tools/pnnx/src/pass_level2/torch_tile.cpp @@ -60,4 +60,45 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_tile_onnx, 20) +class torch_tile_onnx_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Tile op_0 1 1 input out %*=%* +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.tile"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.repeats") == captured_params.end()) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + if (captured_params.at("op_0.repeats").type == 5) + { + op->params["dims"] = captured_params.at("op_0.repeats"); + } + else // if (captured_params.at("op_0.repeats").type == 2) + { + op->params["dims"] = std::vector{captured_params.at("op_0.repeats").i}; + } + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_tile_onnx_1, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level3/fuse_expression.cpp b/tools/pnnx/src/pass_level3/fuse_expression.cpp index 708d1a548df4..8fc918fed9d7 100644 --- a/tools/pnnx/src/pass_level3/fuse_expression.cpp +++ b/tools/pnnx/src/pass_level3/fuse_expression.cpp @@ -154,6 +154,7 @@ static bool operand_maybe_tensor(const Operand* operand) || op->type == "aten::div" || op->type == "aten::floor_divide" || op->type == "aten::fmod" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" @@ -653,6 +654,7 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s else if (op->type == 
"aten::atan2" || op->type == "aten::floor_divide" || op->type == "aten::fmod" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" @@ -867,6 +869,7 @@ void fuse_expression(Graph& graph, const std::set& foldable_constan || op->type == "aten::fmod" || op->type == "aten::log" || op->type == "aten::log10" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" diff --git a/tools/pnnx/src/pass_level5.cpp b/tools/pnnx/src/pass_level5.cpp index 4903f1851179..5f08b80f5ef9 100644 --- a/tools/pnnx/src/pass_level5.cpp +++ b/tools/pnnx/src/pass_level5.cpp @@ -44,6 +44,7 @@ #include "pass_level5/fuse_multiheadattention.h" #include "pass_level5/fuse_pad_conv1d.h" #include "pass_level5/fuse_pad_conv2d.h" +#include "pass_level5/fuse_rmsnorm.h" #include "pass_level5/fuse_scaled_dot_product_attention.h" #include "pass_level5/fuse_select_to_unbind.h" #include "pass_level5/fuse_silu.h" @@ -60,6 +61,7 @@ #include "pass_level5/fuse_static_layernorm.h" #include "pass_level5/fuse_static_linear.h" #include "pass_level5/fuse_static_prelu.h" +#include "pass_level5/fuse_static_rmsnorm.h" #include "pass_level5/normalize_einsum_equation.h" #include "pass_level4/dead_code_elimination.h" #include "pass_level4/canonicalize.h" @@ -102,6 +104,7 @@ void pass_level5(Graph& g, const std::set& foldable_constants, cons fuse_static_groupnorm(g); fuse_static_instancenorm(g); fuse_static_layernorm(g); + fuse_static_rmsnorm(g); fuse_static_conv(g); fuse_static_convtranspose(g); @@ -143,6 +146,7 @@ void pass_level5(Graph& g, const std::set& foldable_constants, cons fuse_channel_shuffle(g); fuse_layernorm(g); + fuse_rmsnorm(g); fuse_multiheadattention(g); fuse_scaled_dot_product_attention(g); diff --git a/tools/pnnx/src/pass_level5/eval_expression.cpp b/tools/pnnx/src/pass_level5/eval_expression.cpp index 44e1f7e36911..c7d5d5d02260 100644 --- a/tools/pnnx/src/pass_level5/eval_expression.cpp +++ b/tools/pnnx/src/pass_level5/eval_expression.cpp @@ -390,7 +390,8 @@ static std::string eval_expression(const Operator* op) || t == "floor_divide" || t == "fmod" || t == "pow" - || t == "remainder") + || t == "remainder" + || t == "logaddexp") { std::string a = exprstack.top(); exprstack.pop(); @@ -459,6 +460,11 @@ static std::string eval_expression(const Operator* op) r += bf; exprstack.push(std::to_string(r)); } + if (t == "logaddexp") + { + float r = log(exp(af) + exp(bf)); + exprstack.push(std::to_string(r)); + } } else { diff --git a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp index 2a9f3b837b17..c178788f2a79 100644 --- a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp @@ -702,6 +702,57 @@ pnnx.Output output 1 0 out } }; +class fuse_multiheadattention_pass_1_1_1 : public fuse_multiheadattention_pass_sameqkv +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +19 18 +pnnx.Input input 0 1 input +nn.Linear op_0 1 1 input 256 bias=%qbias in_features=%embed_dim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input 257 bias=%kbias in_features=%embed_dim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input 260 bias=%vbias in_features=%embed_dim out_features=%embed_dim @bias @weight +Tensor.view op_3 1 1 256 263 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.view op_4 1 1 257 258 shape=(%batch,%size,%num_heads,%feat_per_head) 
+Tensor.view op_5 1 1 260 261 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 263 264 dims=(0,2,1,3) +Tensor.permute op_7 1 1 258 259 dims=(0,2,1,3) +Tensor.permute op_8 1 1 261 262 dims=(0,2,1,3) +torch.transpose op_9 1 1 259 265 dim0=-1 dim1=-2 +torch.matmul op_10 2 1 264 265 266 +pnnx.Expression op_11 1 1 266 267 expr=div(@0,%sqrt_feat_per_head) +F.softmax softmax 1 1 267 268 dim=%softmax_dim +torch.matmul op_13 2 1 268 262 269 +Tensor.permute op_14 1 1 269 270 dims=(0,2,1,3) +Tensor.reshape op_15 1 1 270 271 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 271 out bias=%outbias in_features=%embed_dim out_features=%embed_dim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int embed_dim = captured_params.at("embed_dim").i; + const int num_heads = captured_params.at("num_heads").i; + const int feat_per_head = captured_params.at("feat_per_head").i; + const float sqrt_feat_per_head = captured_params.at("sqrt_feat_per_head").f; + const int softmax_dim = captured_params.at("softmax_dim").i; + + if (embed_dim != num_heads * feat_per_head) + return false; + + if (!NearlyEqual(sqrt_feat_per_head, sqrt(feat_per_head), 0.001)) + return false; + + int softmax_input_rank = (int)matched_operators.at("softmax")->inputs[0]->shape.size(); + if (softmax_dim != -1 && softmax_dim != softmax_input_rank - 1) + return false; + + return true; + } +}; + class fuse_multiheadattention_pass_1_2 : public fuse_multiheadattention_pass_qkv { public: @@ -1734,6 +1785,64 @@ pnnx.Output output 1 0 out } }; +class fuse_multiheadattention_pass_onnx_1_2 : public fuse_multiheadattention_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +21 20 +pnnx.Input input_q 0 1 input +nn.Linear op_0 1 1 input 14 bias=%qkvbias in_features=%embed_dim out_features=%qkv_out_features @bias @weight +Tensor.reshape op_1 1 1 14 15 shape=(%batch,%size,1,3,%embed_dim) +Tensor.permute op_2 1 1 15 16 dims=(3,1,2,0,4) +torch.squeeze op_3 1 1 16 17 dim=3 +torch.unbind op_4 1 3 17 18 19 20 dim=0 +Tensor.reshape op_5 1 1 18 21 shape=(%size,%num_heads,%feat_per_head) +Tensor.reshape op_6 1 1 19 23 shape=(%size,%num_heads,%feat_per_head) +Tensor.reshape op_7 1 1 20 25 shape=(%size,%num_heads,%feat_per_head) +Tensor.permute op_8 1 1 21 22 dims=(1,0,2) +Tensor.permute op_9 1 1 23 24 dims=(1,0,2) +Tensor.permute op_10 1 1 25 26 dims=(1,0,2) +Tensor.reshape op_11 1 1 22 27 shape=(%batch,%num_heads,%size,%feat_per_head) +Tensor.reshape op_12 1 1 24 28 shape=(%batch,%num_heads,%size,%feat_per_head) +Tensor.reshape op_13 1 1 26 29 shape=(%batch,%num_heads,%size,%feat_per_head) +F.scaled_dot_product_attention op_14 3 1 27 28 29 35 dropout_p=0.000000e+00 is_causal=False +Tensor.permute op_15 1 1 35 36 dims=(2,0,1,3) +Tensor.reshape op_16 1 1 36 37 shape=(%size,%embed_dim) +nn.Linear out_proj 1 1 37 38 bias=%outbias in_features=%embed_dim out_features=%embed_dim @bias @weight +Tensor.reshape op_18 1 1 38 out shape=(%size,%batch,%embed_dim) +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.MultiheadAttention attention 1 1 input out embed_dim=%embed_dim kdim=%embed_dim vdim=%embed_dim num_heads=%num_heads batch_first=False add_zero_attn=False add_bias_kv=False +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& 
matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int embed_dim = captured_params.at("embed_dim").i; + const int qkv_out_features = captured_params.at("qkv_out_features").i; + const int num_heads = captured_params.at("num_heads").i; + const int feat_per_head = captured_params.at("feat_per_head").i; + + if (qkv_out_features != embed_dim * 3) + return false; + + if (embed_dim != num_heads * feat_per_head) + return false; + + return true; + } +}; + class fuse_multiheadattention_pass_onnx_2 : public fuse_multiheadattention_pass { public: @@ -2024,6 +2133,7 @@ void fuse_multiheadattention(Graph& graph) fuse_multiheadattention_pass_q_samekv d; fuse_multiheadattention_pass_1 b1; fuse_multiheadattention_pass_1_1 b11; + fuse_multiheadattention_pass_1_1_1 b111; fuse_multiheadattention_pass_1_2 b12; fuse_multiheadattention_pass_2 c1; fuse_multiheadattention_pass_3 d1; @@ -2048,6 +2158,7 @@ void fuse_multiheadattention(Graph& graph) fuse_multiheadattention_pass_onnx onnx0; fuse_multiheadattention_pass_onnx_1 onnx1; fuse_multiheadattention_pass_onnx_1_1 onnx1a; + fuse_multiheadattention_pass_onnx_1_2 onnx1b; fuse_multiheadattention_pass_onnx_2 onnx2; fuse_multiheadattention_pass_onnx_3 onnx3; fuse_multiheadattention_pass_onnx_4 onnx4; @@ -2063,6 +2174,7 @@ void fuse_multiheadattention(Graph& graph) pnnx_graph_rewrite(graph, &d, opindex); pnnx_graph_rewrite(graph, &b1, opindex); pnnx_graph_rewrite(graph, &b11, opindex); + pnnx_graph_rewrite(graph, &b111, opindex); pnnx_graph_rewrite(graph, &b12, opindex); pnnx_graph_rewrite(graph, &c1, opindex); pnnx_graph_rewrite(graph, &d1, opindex); @@ -2087,6 +2199,7 @@ void fuse_multiheadattention(Graph& graph) pnnx_graph_rewrite(graph, &onnx0, opindex); pnnx_graph_rewrite(graph, &onnx1, opindex); pnnx_graph_rewrite(graph, &onnx1a, opindex); + pnnx_graph_rewrite(graph, &onnx1b, opindex); pnnx_graph_rewrite(graph, &onnx2, opindex); pnnx_graph_rewrite(graph, &onnx3, opindex); pnnx_graph_rewrite(graph, &onnx4, opindex); diff --git a/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp b/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp new file mode 100644 index 000000000000..7b99770ed6ed --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp @@ -0,0 +1,97 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
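+
+// Level-5 fusion: recognizes the unfused RMSNorm expression
+//   mul(weight, mul(input, rsqrt(add(mean(pow(input,2), dim=-1, keepdim=True), eps))))
+// as well as the equivalent reciprocal(sqrt(...)) and div(1, sqrt(...)) spellings,
+// and replaces the subgraph with a single nn.RMSNorm operator with elementwise_affine=True.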
+ +#include "fuse_rmsnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_rmsnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,rsqrt(add(@2,%eps)))) +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm rmsnorm 1 1 input out elementwise_affine=True eps=%eps normalized_shape=(%c) @weight=%op_0.data +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +class fuse_rmsnorm_pass_1 : public fuse_rmsnorm_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2.000000e+00) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,reciprocal(sqrt(add(@2,%eps))))) +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +class fuse_rmsnorm_pass_onnx : public fuse_rmsnorm_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2.000000e+00) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,div(1.000000e+00,sqrt(add(@2,%eps))))) +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +void fuse_rmsnorm(Graph& graph) +{ + fuse_rmsnorm_pass a; + fuse_rmsnorm_pass_1 a1; + fuse_rmsnorm_pass_onnx b; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &a1, opindex); + pnnx_graph_rewrite(graph, &b, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_rmsnorm.h b/tools/pnnx/src/pass_level5/fuse_rmsnorm.h new file mode 100644 index 000000000000..0ba18e37f61b --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_rmsnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "ir.h" + +namespace pnnx { + +void fuse_rmsnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp b/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp index 8f265f374dc3..a6dcbc86db75 100644 --- a/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp @@ -62,7 +62,7 @@ pnnx.Output output 1 0 out pnnx.Input input_0 0 1 query pnnx.Input input_1 0 1 key pnnx.Input input_2 0 1 value -F.scaled_dot_product_attention op_0 3 1 query key value out attn_mask=None dropout_p=0.0 is_causal=False +F.scaled_dot_product_attention sdpa 3 1 query key value out attn_mask=None dropout_p=0.0 is_causal=False pnnx.Output output 1 0 out )PNNXIR"; } @@ -114,7 +114,7 @@ pnnx.Input input_Rh 0 1 Rh pnnx.Input input_Rw 0 1 Rw pnnx.Expression RhRw 2 1 Rh Rw RhRw expr=add(@0,@1) #RhRw=(%batch,%h,%w,%h,%w)f32 Tensor.reshape attn_mask 1 1 RhRw attn_mask shape=(%batch,%qsize,%qsize) #attn_mask=(%batch,%qsize,%qsize)f32 -F.scaled_dot_product_attention op_0 4 1 query key value attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask +F.scaled_dot_product_attention sdpa 4 1 query key value attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask pnnx.Output output 1 0 out )PNNXIR"; } @@ -137,15 +137,95 @@ pnnx.Output output 1 0 out } }; +class fuse_scaled_dot_product_attention_pass_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +12 11 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +pnnx.Input input_3 0 1 attn_mask +Tensor.permute op_0 1 1 query 13 dims=(0,2,1,3) +Tensor.permute op_1 1 1 key 20 dims=(0,2,3,1) +Tensor.permute op_2 1 1 value 19 dims=(0,2,1,3) +torch.matmul op_3 2 1 13 20 21 +pnnx.Expression op_4 2 1 21 attn_mask 23 expr=add(@0,@1) +F.softmax softmax 1 1 23 24 dim=%softmax_dim +torch.matmul op_6 2 1 24 19 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +9 8 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +pnnx.Input input_3 0 1 attn_mask +Tensor.permute op_0 1 1 query q dims=(0,2,1,3) +Tensor.permute op_1 1 1 key k dims=(0,2,1,3) +Tensor.permute op_2 1 1 value v dims=(0,2,1,3) +F.scaled_dot_product_attention sdpa 4 1 q k v attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int softmax_dim = captured_params.at("softmax_dim").i; + + int softmax_input_rank = (int)matched_operators.at("softmax")->inputs[0]->shape.size(); + if (softmax_dim != -1 && softmax_dim != softmax_input_rank - 1) + return false; + + return true; + } + + void write(const std::map& ops, const std::map& /*captured_params*/, const std::map& /*captured_attrs*/) const + { + Operator* op = ops.at("sdpa"); + + op->params["scale"] = 1.f; + + // rewrite qkv shape + { + std::vector q_shape = ops.at("op_0")->inputs[0]->shape; + std::vector k_shape = ops.at("op_1")->inputs[0]->shape; + std::vector v_shape = ops.at("op_2")->inputs[0]->shape; + + if (!q_shape.empty()) + std::swap(q_shape[1], q_shape[2]); + if (!k_shape.empty()) + std::swap(k_shape[1], k_shape[2]); + if (!v_shape.empty()) + std::swap(v_shape[1], v_shape[2]); + + ops.at("op_0")->outputs[0]->shape = 
q_shape; + ops.at("op_0")->outputs[0]->type = ops.at("op_0")->inputs[0]->type; + ops.at("op_1")->outputs[0]->shape = k_shape; + ops.at("op_1")->outputs[0]->type = ops.at("op_1")->inputs[0]->type; + ops.at("op_2")->outputs[0]->shape = v_shape; + ops.at("op_2")->outputs[0]->type = ops.at("op_2")->inputs[0]->type; + } + } +}; + void fuse_scaled_dot_product_attention(Graph& graph) { #if TORCH_VERSION_MAJOR >= 2 fuse_scaled_dot_product_attention_pass a; fuse_scaled_dot_product_attention_pass_1 b; + fuse_scaled_dot_product_attention_pass_onnx onnx0; int opindex = 0; pnnx_graph_rewrite(graph, &a, opindex); pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &onnx0, opindex); #endif } diff --git a/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp new file mode 100644 index 000000000000..ed68c026d309 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp @@ -0,0 +1,57 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fuse_static_rmsnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Frmsnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @data +F.rms_norm op_0 2 1 input weight out normalized_shape=%normalized_shape eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm rmsn 1 1 input out normalized_shape=%normalized_shape eps=%eps elementwise_affine=True @weight=%op_weight.data +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +void fuse_static_rmsnorm(Graph& graph) +{ + fuse_static_Frmsnorm_pass a; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h new file mode 100644 index 000000000000..c88b703cb072 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_rmsnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp index 1d9ca98e03d8..aaef7db2d74b 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp @@ -63,6 +63,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool1d, 20) +class F_max_pool1d_1 : public F_max_pool1d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool1d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool1d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp index ba5a52f4f7dd..3519c8a022b7 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp @@ -66,6 +66,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool2d, 20) +class F_max_pool2d_1 : public F_max_pool2d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool2d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool2d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp index 5476907fa881..2caede16a293 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp @@ -69,6 +69,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool3d, 20) +class F_max_pool3d_1 : public F_max_pool3d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool3d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool3d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp b/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp new file mode 100644 index 000000000000..8230168312c2 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp @@ -0,0 +1,65 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
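For orientation before the F_rms_norm converter that follows: its write() flattens %normalized_shape into a single affine size and maps a missing eps to 0. A hypothetical Python helper mirroring that logic (param ids 0/1/2 are affine size, eps, and the affine flag, which is 0 here because the functional pattern has weight=None):

    from functools import reduce
    from operator import mul

    def rmsnorm_layer_params(normalized_shape, eps):
        # mirrors the converter: 0 = affine_size, 1 = eps (0 when None), 2 = affine flag
        affine_size = reduce(mul, normalized_shape, 1)
        return {"0": affine_size, "1": 0.0 if eps is None else eps, "2": 0}

    # e.g. rmsnorm_layer_params((12, 16), 1e-3) -> {"0": 192, "1": 0.001, "2": 0}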
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_rms_norm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.rms_norm op_0 1 1 input out weight=None normalized_shape=%normalized_shape eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "RMSNorm"; + } + + const char* name_str() const + { + return "rmsn"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::vector& normalized_shape = captured_params.at("normalized_shape").ai; + int affine_size = normalized_shape[0]; + for (size_t i = 1; i < normalized_shape.size(); i++) + { + affine_size *= normalized_shape[i]; + } + + const float eps = captured_params.at("eps").type == 0 ? 0.f : captured_params.at("eps").f; + + op->params["0"] = affine_size; + op->params["1"] = eps; + op->params["2"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_rms_norm, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp b/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp new file mode 100644 index 000000000000..af9f06b3f528 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_scaled_dot_product_attention : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +16 15 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 attn_mask +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 4 1 16 17 18 attn_mask 19 dropout_p=0.0 is_causal=False scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "MultiHeadAttention"; + } + + const char* name_str() const + { + return "sdpa_attention"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + op->params["0"] = captured_params.at("embed_dim"); + op->params["1"] = captured_params.at("num_heads"); + + const int embed_dim = captured_params.at("embed_dim").i; + const int qdim = captured_params.at("qdim").i; + const int kdim = captured_params.at("kdim").i; + const int vdim = captured_params.at("vdim").i; + + op->params["2"] = embed_dim * qdim; + op->params["3"] = kdim; + op->params["4"] = vdim; + op->params["5"] = 1; + op->params["6"] = captured_params.at("scale"); + + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = captured_attrs.at("op_0.weight"); + if (captured_params.at("qbias").b) + { + op->attrs["2"] = captured_attrs.at("op_0.bias"); + } + else + { + op->attrs["2"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["3"] = Attribute(); + op->attrs["3"].data = {0, 0, 0, 0}; + op->attrs["4"] = captured_attrs.at("op_1.weight"); + if (captured_params.at("kbias").b) + { + op->attrs["5"] = captured_attrs.at("op_1.bias"); + } + else + { + op->attrs["5"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + op->attrs["7"] = captured_attrs.at("op_2.weight"); + if (captured_params.at("vbias").b) + { + op->attrs["8"] = captured_attrs.at("op_2.bias"); + } + else + { + op->attrs["8"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["9"] = Attribute(); + op->attrs["9"].data = {0, 0, 0, 0}; + op->attrs["a"] = captured_attrs.at("out_proj.weight"); + if (captured_params.at("outbias").b) + { + op->attrs["b"] = captured_attrs.at("out_proj.bias"); + } + else + { + op->attrs["b"] = Attribute({qdim}, std::vector(qdim, 0.f)); + } + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention, 10) + +class F_scaled_dot_product_attention_1 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +17 16 +pnnx.Input input_0 0 1 input 
+pnnx.Input input_1 0 1 kv +pnnx.Input input_2 0 1 attn_mask +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 kv k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 kv v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%qsize,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 4 1 16 17 18 attn_mask 19 dropout_p=0.0 is_causal=False scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%qsize,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_1, 10) + +class F_scaled_dot_product_attention_2 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +15 14 +pnnx.Input input 0 1 input +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 3 1 16 17 18 19 dropout_p=0.0 is_causal=False attn_mask=None scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + F_scaled_dot_product_attention::write(op, captured_params, captured_attrs); + op->params["5"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_2, 10) + +class F_scaled_dot_product_attention_3 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +16 15 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 kv +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 kv k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 kv v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%qsize,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 
18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 3 1 16 17 18 19 dropout_p=0.0 is_causal=False attn_mask=None scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%qsize,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + F_scaled_dot_product_attention::write(op, captured_params, captured_attrs); + op->params["5"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_3, 10) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/expand_expression.cpp b/tools/pnnx/src/pass_ncnn/expand_expression.cpp index f8f97baa55c0..2fdc6d77d62e 100644 --- a/tools/pnnx/src/pass_ncnn/expand_expression.cpp +++ b/tools/pnnx/src/pass_ncnn/expand_expression.cpp @@ -185,6 +185,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx || t == "div" || t == "floor_divide" || t == "fmod" + || t == "logaddexp" || t == "max" || t == "maximum" || t == "min" @@ -211,6 +212,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx if (t == "sub") op_binary->params["0"] = 1; if (t == "mul") op_binary->params["0"] = 2; if (t == "div") op_binary->params["0"] = 3; + if (t == "logaddexp") fprintf(stderr, "BinaryOp logaddexp not supported yet\n"); // TODO if (t == "max" || t == "maximum") op_binary->params["0"] = 4; if (t == "min" || t == "minimum") op_binary->params["0"] = 5; if (t == "floor_divide") fprintf(stderr, "BinaryOp floor_divide not supported yet\n"); // TODO diff --git a/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp b/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp new file mode 100644 index 000000000000..7fda637c5cac --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp @@ -0,0 +1,70 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
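On the logaddexp entry added to expand_expression above: it is the numerically stable form of log(exp(a)+exp(b)), which has no ncnn BinaryOp yet, hence the warning. An illustrative check of the identity (values kept small so the naive form does not overflow):

    import torch

    a, b = torch.randn(3, 4), torch.randn(3, 4)
    ref = torch.logaddexp(a, b)
    naive = torch.log(torch.exp(a) + torch.exp(b))  # same value, but overflows for large inputs
    assert torch.allclose(ref, naive, atol=1e-6)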
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_RMSNorm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm op_0 1 1 input out normalized_shape=%normalized_shape eps=%eps elementwise_affine=%elementwise_affine @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "RMSNorm"; + } + + const char* name_str() const + { + return "rmsn"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& normalized_shape = captured_params.at("normalized_shape").ai; + int affine_size = normalized_shape[0]; + for (size_t i = 1; i < normalized_shape.size(); i++) + { + affine_size *= normalized_shape[i]; + } + + const float eps = captured_params.at("eps").type == 0 ? 0.f : captured_params.at("eps").f; + + op->params["0"] = affine_size; + op->params["1"] = eps; + op->params["2"] = captured_params.at("elementwise_affine").b ? 1 : 0; + + if (captured_params.at("elementwise_affine").b) + { + op->attrs["0"] = captured_attrs.at("op_0.weight"); + } + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_RMSNorm, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 6e53f7aa841b..d4532422b522 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -46,6 +46,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.group_norm", "F.instance_norm", "F.interpolate", + "F.layer_norm", "F.linear", "F.local_response_norm", "F.lp_pool1d", @@ -56,6 +57,8 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.pixel_shuffle", "F.pixel_unshuffle", "F.prelu", + "F.rms_norm", + "F.scaled_dot_product_attention", "F.unfold", "F.upsample_bilinear", "F.upsample_nearest", @@ -90,6 +93,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.InstanceNorm2d", "nn.InstanceNorm3d", "nn.LocalResponseNorm", + "nn.LayerNorm", "nn.LPPool1d", "nn.LPPool2d", "nn.MaxPool1d", @@ -103,6 +107,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.ReplicationPad1d", "nn.ReplicationPad2d", "nn.ReplicationPad3d", + "nn.RMSNorm", "nn.Softmax2d", "nn.Unfold", "nn.Upsample", diff --git a/tools/pnnx/src/pass_ncnn/torch_max.cpp b/tools/pnnx/src/pass_ncnn/torch_max.cpp index 76cd33f239b6..95987da5162f 100644 --- a/tools/pnnx/src/pass_ncnn/torch_max.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_max.cpp @@ -65,6 +65,22 @@ pnnx.Output output 2 0 out indices REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_max, 20) +class torch_max_0 : public torch_max +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.max op_0 1 1 input out dim=%dim keepdim=%keepdim +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_max_0, 20) + class torch_max_1 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/torch_min.cpp b/tools/pnnx/src/pass_ncnn/torch_min.cpp index 49851b443dbf..3ef2ae47da00 100644 --- a/tools/pnnx/src/pass_ncnn/torch_min.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_min.cpp @@ -65,6 +65,22 @@ pnnx.Output output 2 0 out indices REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_min, 20) +class torch_min_0 : public torch_min +{ +public: 
+ const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.min op_0 1 1 input out dim=%dim keepdim=%keepdim +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_min_0, 20) + class torch_min_1 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/torch_roll.cpp b/tools/pnnx/src/pass_ncnn/torch_roll.cpp new file mode 100644 index 000000000000..c7c295933337 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/torch_roll.cpp @@ -0,0 +1,193 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class torch_roll : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.roll op_0 1 1 input out dims=%dims shifts=%shifts +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +Slice slice 1 2 input a b +Concat concat 2 1 b a out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.at("dims").type != 5) + return false; + + if (captured_params.at("dims").ai.size() != 1) + return false; + + if (captured_params.at("shifts").type != 5) + return false; + + if (captured_params.at("shifts").ai.size() != 1) + return false; + + return true; + } + + void write(const std::map& ops, const std::map& captured_params, const std::map& captured_attrs) const + { + GraphRewriterPass::write(ops, captured_params, captured_attrs); + + const Operand* in = ops.at("slice")->inputs[0]; + + const int batch_index = in->params.at("__batch_index").i; + + int axis = captured_params.at("dims").ai[0]; + if (axis == batch_index) + { + fprintf(stderr, "roll along batch axis %d is not supported\n", batch_index); + } + + if (axis < 0) + { + int input_rank = in->shape.size(); + axis = input_rank + axis; + } + + if (axis > batch_index) + axis -= 1; + + ops.at("slice")->params["1"] = axis; + + ops.at("concat")->params["0"] = axis; + + const int shift = captured_params.at("shifts").ai[0]; + ops.at("slice")->params["2"] = std::vector{-shift}; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_roll, 20) + +class torch_roll_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.roll op_0 1 1 input out dims=%dims shifts=%shifts +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input 0 1 input +Slice slice 1 2 input a b +Slice slice_a 1 2 a a0 a1 +Slice slice_b 1 2 b b0 b1 +Concat concat_a 2 1 a1 a0 a10 +Concat concat_b 2 1 b1 b0 b10 +Concat concat 2 1 b10 a10 
out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.at("dims").type != 5) + return false; + + if (captured_params.at("dims").ai.size() != 2) + return false; + + if (captured_params.at("shifts").type != 5) + return false; + + if (captured_params.at("shifts").ai.size() != 2) + return false; + + return true; + } + + void write(const std::map& ops, const std::map& captured_params, const std::map& captured_attrs) const + { + GraphRewriterPass::write(ops, captured_params, captured_attrs); + + const Operand* in = ops.at("slice")->inputs[0]; + + const int batch_index = in->params.at("__batch_index").i; + + int axis0 = captured_params.at("dims").ai[0]; + int axis1 = captured_params.at("dims").ai[1]; + if (axis0 == batch_index || axis1 == batch_index) + { + fprintf(stderr, "roll along batch axis %d is not supported\n", batch_index); + } + + if (axis0 < 0) + { + int input_rank = in->shape.size(); + axis0 = input_rank + axis0; + } + + if (axis0 > batch_index) + axis0 -= 1; + + if (axis1 < 0) + { + int input_rank = in->shape.size(); + axis1 = input_rank + axis1; + } + if (axis1 > batch_index) + axis1 -= 1; + + ops.at("slice")->params["1"] = axis0; + ops.at("slice_a")->params["1"] = axis1; + ops.at("slice_b")->params["1"] = axis1; + + ops.at("concat_a")->params["0"] = axis1; + ops.at("concat_b")->params["0"] = axis1; + ops.at("concat")->params["0"] = axis0; + + const int shift0 = captured_params.at("shifts").ai[0]; + const int shift1 = captured_params.at("shifts").ai[1]; + ops.at("slice")->params["2"] = std::vector{-shift0}; + ops.at("slice_a")->params["2"] = std::vector{-shift1}; + ops.at("slice_b")->params["2"] = std::vector{-shift1}; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_roll_1, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_onnx.cpp b/tools/pnnx/src/pass_onnx.cpp index dd9194111fc1..87dd27d27cbc 100644 --- a/tools/pnnx/src/pass_onnx.cpp +++ b/tools/pnnx/src/pass_onnx.cpp @@ -14,7 +14,7 @@ #include "pass_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include @@ -820,6 +820,8 @@ void pass_onnx(const onnx::ModelProto& model, Graph& pnnx_graph) is_attr_weight = true; if (sim_op_type == "Gather" && j == 0) is_attr_weight = true; + if (sim_op_type == "Gemm" && (j == 1 || j == 2)) + is_attr_weight = true; if (sim_op_type == "GroupNormalization" && (j == 1 || j == 2)) is_attr_weight = true; if (sim_op_type == "GRU" && (j == 1 || j == 2 || j == 3 || j == 5)) diff --git a/tools/pnnx/src/pass_onnx/canonicalize.h b/tools/pnnx/src/pass_onnx/canonicalize.h index a24ad86a9fdb..6ec55f2d1401 100644 --- a/tools/pnnx/src/pass_onnx/canonicalize.h +++ b/tools/pnnx/src/pass_onnx/canonicalize.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/dead_code_elimination.h b/tools/pnnx/src/pass_onnx/dead_code_elimination.h index b890b6a7d7c5..7d8b7e0d25d6 100644 --- a/tools/pnnx/src/pass_onnx/dead_code_elimination.h +++ b/tools/pnnx/src/pass_onnx/dead_code_elimination.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/eliminate_noop.h b/tools/pnnx/src/pass_onnx/eliminate_noop.h index e465e398c0aa..3325ae9cf104 100644 --- a/tools/pnnx/src/pass_onnx/eliminate_noop.h +++ b/tools/pnnx/src/pass_onnx/eliminate_noop.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/fold_constants.h b/tools/pnnx/src/pass_onnx/fold_constants.h index 98d6ef717abc..f165a96e177f 100644 --- a/tools/pnnx/src/pass_onnx/fold_constants.h +++ b/tools/pnnx/src/pass_onnx/fold_constants.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp index a3021d33c907..39dc8d808826 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp @@ -32,6 +32,7 @@ struct constant_as_attribute }; static constant_as_attribute caas[] = { + {"Expand", 1, "shape"}, {"Gather", 1, "indices"}, {"If", 0, "cond"}, {"Pad", 1, "pads"}, @@ -42,6 +43,7 @@ static constant_as_attribute caas[] = { {"ReduceProd", 1, "axes"}, {"ReduceSum", 1, "axes"}, {"Reshape", 1, "shape"}, + {"Resize", 1, "roi"}, {"Resize", 2, "scales"}, {"Resize", 3, "sizes"}, {"Slice", 1, "starts"}, @@ -49,6 +51,7 @@ static constant_as_attribute caas[] = { {"Slice", 3, "axes"}, {"Slice", 4, "steps"}, {"Squeeze", 1, "axes"}, + {"Tile", 1, "repeats"}, {"Unsqueeze", 1, "axes"}, {"Upsample", 1, "scales"}, }; diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h index ad6cf80007c4..a90c089fee6c 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/inline_containers.h b/tools/pnnx/src/pass_onnx/inline_containers.h index 56b21f47b374..e3051c5e3330 100644 --- a/tools/pnnx/src/pass_onnx/inline_containers.h +++ b/tools/pnnx/src/pass_onnx/inline_containers.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/inline_if_graph.h b/tools/pnnx/src/pass_onnx/inline_if_graph.h index c84b5761ac57..e9c1c2f0ee8c 100644 --- a/tools/pnnx/src/pass_onnx/inline_if_graph.h +++ b/tools/pnnx/src/pass_onnx/inline_if_graph.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/model_stat.h b/tools/pnnx/src/pass_onnx/model_stat.h index dd62e67a1bc9..993630b1b4b7 100644 --- a/tools/pnnx/src/pass_onnx/model_stat.h +++ b/tools/pnnx/src/pass_onnx/model_stat.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp index 0e8851f05f20..21cf6076d2d2 100644 --- a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp index 070981e1d642..a8e3e96be6be 100644 --- a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp index 5a006fe37090..6f5be930e643 100644 --- a/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp index ff2a5dd8aad6..9fdcfdd72d64 100644 --- a/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp b/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp index c3639904d477..96448c0f25c3 100644 --- a/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp b/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp index 0f9405f160ae..afac686a22aa 100644 --- a/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp b/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp index c9aeac561ac2..2cd6b7dd750f 100644 --- a/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp b/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp index 6413685fcb5c..f90c23cbb6ab 100644 --- a/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_GELU.cpp b/tools/pnnx/src/pass_onnx/nn_GELU.cpp index f5b7000e017a..22d2823673a5 100644 
--- a/tools/pnnx/src/pass_onnx/nn_GELU.cpp +++ b/tools/pnnx/src/pass_onnx/nn_GELU.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp b/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp index f4ecf2895576..fece12e2bcee 100644 --- a/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp +++ b/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Linear.cpp b/tools/pnnx/src/pass_onnx/nn_Linear.cpp index 4dce81908b2b..0515a8ea4549 100644 --- a/tools/pnnx/src/pass_onnx/nn_Linear.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Linear.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp index 47924bd33fcf..518abd434b0b 100644 --- a/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp index c8c467f5ba29..04de8bd104a2 100644 --- a/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp b/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp index a29ec9d93062..df1bd0922734 100644 --- a/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/shape_inference.h b/tools/pnnx/src/pass_onnx/shape_inference.h index b4cd657bb812..b484d5265cae 100644 --- a/tools/pnnx/src/pass_onnx/shape_inference.h +++ b/tools/pnnx/src/pass_onnx/shape_inference.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/save_onnx.cpp b/tools/pnnx/src/save_onnx.cpp index 3406c730b2d0..3ef3a772a2f4 100644 --- a/tools/pnnx/src/save_onnx.cpp +++ b/tools/pnnx/src/save_onnx.cpp @@ -14,7 +14,7 @@ #include "save_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index 2046a6392566..0dd566c37b58 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -61,6 +61,7 @@ pnnx_add_test(F_pixel_unshuffle) pnnx_add_test(F_prelu) pnnx_add_test(F_relu) pnnx_add_test(F_relu6) +pnnx_add_test(F_rms_norm) pnnx_add_test(F_rrelu) pnnx_add_test(F_scaled_dot_product_attention) pnnx_add_test(F_selu) @@ -145,6 +146,7 @@ pnnx_add_test(nn_ReLU6) pnnx_add_test(nn_ReplicationPad1d) pnnx_add_test(nn_ReplicationPad2d) pnnx_add_test(nn_ReplicationPad3d) +pnnx_add_test(nn_RMSNorm) pnnx_add_test(nn_RNN) pnnx_add_test(nn_RReLU) pnnx_add_test(nn_SELU) @@ -234,6 +236,7 @@ pnnx_add_test(torch_ones_like) pnnx_add_test(torch_positive) pnnx_add_test(torch_prod) pnnx_add_test(torch_repeat_interleave) +pnnx_add_test(torch_roll) pnnx_add_test(torch_scatter_add) pnnx_add_test(torch_slice_scatter) pnnx_add_test(torch_sum) @@ -295,6 +298,7 @@ pnnx_add_test(torch_floor) pnnx_add_test(torch_imag) pnnx_add_test(torch_log) pnnx_add_test(torch_log10) +pnnx_add_test(torch_logaddexp) pnnx_add_test(torch_maximum) pnnx_add_test(torch_minimum) pnnx_add_test(torch_neg) @@ -342,6 +346,7 @@ pnnx_add_test(pnnx_fuse_input_unpack) pnnx_add_test(pnnx_fuse_layernorm) pnnx_add_test(pnnx_fuse_linear_batchnorm1d) pnnx_add_test(pnnx_fuse_multiheadattention) +pnnx_add_test(pnnx_fuse_rmsnorm) pnnx_add_test(pnnx_fuse_scaled_dot_product_attention) pnnx_add_test(pnnx_fuse_select_to_unbind) pnnx_add_test(pnnx_fuse_slice_to_tensor_split) diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index a682e42835b9..49cb063f335e 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -53,6 +53,7 @@ pnnx_ncnn_add_test(F_pixel_unshuffle) pnnx_ncnn_add_test(F_prelu) pnnx_ncnn_add_test(F_relu) pnnx_ncnn_add_test(F_relu6) +pnnx_ncnn_add_test(F_rms_norm) pnnx_ncnn_add_test(F_selu) pnnx_ncnn_add_test(F_sigmoid) pnnx_ncnn_add_test(F_silu) @@ -123,6 +124,7 @@ pnnx_ncnn_add_test(nn_ReLU6) pnnx_ncnn_add_test(nn_ReplicationPad1d) pnnx_ncnn_add_test(nn_ReplicationPad2d) pnnx_ncnn_add_test(nn_ReplicationPad3d) +pnnx_ncnn_add_test(nn_RMSNorm) pnnx_ncnn_add_test(nn_RNN) pnnx_ncnn_add_test(nn_SELU) pnnx_ncnn_add_test(nn_Sigmoid) @@ -162,6 +164,7 @@ pnnx_ncnn_add_test(torch_min) pnnx_ncnn_add_test(torch_mm) pnnx_ncnn_add_test(torch_norm) pnnx_ncnn_add_test(torch_prod) +pnnx_ncnn_add_test(torch_roll) pnnx_ncnn_add_test(torch_slice_scatter) pnnx_ncnn_add_test(torch_sum) pnnx_ncnn_add_test(torch_squeeze) diff --git a/tools/pnnx/tests/ncnn/test_F_layer_norm.py b/tools/pnnx/tests/ncnn/test_F_layer_norm.py index 92244f179104..9d590aa76dda 100644 --- a/tools/pnnx/tests/ncnn/test_F_layer_norm.py +++ b/tools/pnnx/tests/ncnn/test_F_layer_norm.py @@ -37,8 +37,8 @@ def test(): net.eval() torch.manual_seed(0) - x = torch.rand(12, 24) - y = torch.rand(3, 12, 16) + x = torch.rand(1, 12, 24) + y = torch.rand(1, 3, 12, 16) a = net(x, y) @@ -48,7 +48,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_F_layer_norm.pt inputshape=[12,24],[3,12,16]") + os.system("../../src/pnnx test_F_layer_norm.pt 
inputshape=[1,12,24],[1,3,12,16]") # ncnn inference import test_F_layer_norm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_F_rms_norm.py b/tools/pnnx/tests/ncnn/test_F_rms_norm.py new file mode 100644 index 000000000000..f30f72f9ac45 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_rms_norm.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.w3 = nn.Parameter(torch.rand(24)) + self.w4 = nn.Parameter(torch.rand(12, 16)) + + def forward(self, x, y): + x = F.rms_norm(x, (24,), self.w3) + + y = F.rms_norm(y, (16,), None) + z = F.rms_norm(y, (12,16), self.w4, eps=1e-3) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24) + y = torch.rand(1, 3, 12, 16) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_F_rms_norm.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_F_rms_norm.pt inputshape=[1,12,24],[1,3,12,16]") + + # ncnn inference + import test_F_rms_norm_ncnn + b = test_F_rms_norm_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-3, 1e-3): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py b/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py index a45444060d04..d409bdfba3a1 100644 --- a/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py +++ b/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py @@ -36,8 +36,8 @@ def test(): net.eval() torch.manual_seed(0) - x = torch.rand(24, 64) - y = torch.rand(12, 24, 64) + x = torch.rand(1, 24, 64) + y = torch.rand(1, 12, 24, 64) a = net(x, y) @@ -47,7 +47,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_nn_LayerNorm.pt inputshape=[24,64],[12,24,64]") + os.system("../../src/pnnx test_nn_LayerNorm.pt inputshape=[1,24,64],[1,12,24,64]") # ncnn inference import test_nn_LayerNorm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py new file mode 100644 index 000000000000..e69ad1220bc1 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.rmsn_0 = nn.RMSNorm(64) + self.rmsn_0.weight = nn.Parameter(torch.rand(64)) + self.rmsn_1 = nn.RMSNorm(normalized_shape=(24,64), eps=1e-2, elementwise_affine=False) + + def forward(self, x, y): + x = self.rmsn_0(x) + y = self.rmsn_0(y) + z = self.rmsn_1(y) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 24, 64) + y = torch.rand(1, 12, 24, 64) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_nn_RMSNorm.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_nn_RMSNorm.pt inputshape=[1,24,64],[1,12,24,64]") + + # ncnn inference + import test_nn_RMSNorm_ncnn + b = test_nn_RMSNorm_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-3, 1e-3): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_roll.py b/tools/pnnx/tests/ncnn/test_torch_roll.py new file mode 100644 index 000000000000..6412ee6ba603 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_roll.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
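Before the roll test below, a minimal sketch of the slice-then-concat equivalence that the ncnn torch_roll rewrite added earlier in this diff relies on; the shift, axis, and tensor shape are illustrative, and only a single non-batch axis is handled here:

    import torch

    def roll_via_slice_concat(x, shift, dim):
        # single-axis equivalent of the Slice + Concat rewrite
        shift = shift % x.size(dim)
        a, b = torch.split(x, [x.size(dim) - shift, shift], dim=dim)
        return torch.cat((b, a), dim=dim)

    x = torch.rand(5, 9, 11)
    assert torch.equal(roll_via_slice_concat(x, -2, -1), torch.roll(x, -2, -1))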
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3, 1) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_roll.pt") + + # torchscript to ncnn + import os + os.system("../../src/pnnx test_torch_roll.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_roll_ncnn + b = test_torch_roll_ncnn.test_inference() + + print(x) + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + print(a0) + print(b0) + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_unbind.py b/tools/pnnx/tests/ncnn/test_torch_unbind.py index 3b8e427010c4..8e224612d7ec 100644 --- a/tools/pnnx/tests/ncnn/test_torch_unbind.py +++ b/tools/pnnx/tests/ncnn/test_torch_unbind.py @@ -26,6 +26,7 @@ def forward(self, x, y): x0 = F.relu(x0) x1 = F.relu(x1) + x2 = F.relu(x2) y0 = F.relu(y0) y1 = F.relu(y1) y2 = F.relu(y2) @@ -35,7 +36,7 @@ def forward(self, x, y): y6 = F.relu(y6) y7 = F.relu(y7) y8 = F.relu(y8) - return x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, y8 + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8 def test(): net = Model() diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt index 0c0a136fbaf1..673fa0434d9c 100644 --- a/tools/pnnx/tests/onnx/CMakeLists.txt +++ b/tools/pnnx/tests/onnx/CMakeLists.txt @@ -29,16 +29,27 @@ pnnx_onnx_add_test(F_layer_norm) pnnx_onnx_add_test(F_leaky_relu) pnnx_onnx_add_test(F_linear) pnnx_onnx_add_test(F_local_response_norm) +pnnx_onnx_add_test(F_logsigmoid) +pnnx_onnx_add_test(F_log_softmax) pnnx_onnx_add_test(F_max_pool1d) pnnx_onnx_add_test(F_max_pool2d) pnnx_onnx_add_test(F_max_pool3d) +pnnx_onnx_add_test(F_mish) pnnx_onnx_add_test(F_pad) pnnx_onnx_add_test(F_prelu) pnnx_onnx_add_test(F_relu) pnnx_onnx_add_test(F_relu6) pnnx_onnx_add_test(F_scaled_dot_product_attention) +pnnx_onnx_add_test(F_selu) pnnx_onnx_add_test(F_sigmoid) +pnnx_onnx_add_test(F_silu) pnnx_onnx_add_test(F_softmax) +pnnx_onnx_add_test(F_softmin) +pnnx_onnx_add_test(F_softplus) +pnnx_onnx_add_test(F_softshrink) +pnnx_onnx_add_test(F_softsign) +pnnx_onnx_add_test(F_tanh) +pnnx_onnx_add_test(F_tanhshrink) pnnx_onnx_add_test(F_upsample_bilinear) pnnx_onnx_add_test(F_upsample_nearest) pnnx_onnx_add_test(F_upsample) @@ -74,10 +85,13 @@ pnnx_onnx_add_test(nn_LayerNorm) pnnx_onnx_add_test(nn_LeakyReLU) pnnx_onnx_add_test(nn_Linear) pnnx_onnx_add_test(nn_LocalResponseNorm) +pnnx_onnx_add_test(nn_LogSigmoid) +pnnx_onnx_add_test(nn_LogSoftmax) pnnx_onnx_add_test(nn_LSTM) pnnx_onnx_add_test(nn_MaxPool1d) pnnx_onnx_add_test(nn_MaxPool2d) pnnx_onnx_add_test(nn_MaxPool3d) +pnnx_onnx_add_test(nn_Mish) pnnx_onnx_add_test(nn_MultiheadAttention) pnnx_onnx_add_test(nn_PReLU) pnnx_onnx_add_test(nn_ReflectionPad1d) @@ -88,8 +102,16 @@ pnnx_onnx_add_test(nn_ReplicationPad1d) pnnx_onnx_add_test(nn_ReplicationPad2d) pnnx_onnx_add_test(nn_ReplicationPad3d) pnnx_onnx_add_test(nn_RNN) +pnnx_onnx_add_test(nn_SELU) pnnx_onnx_add_test(nn_Sigmoid) +pnnx_onnx_add_test(nn_SiLU) pnnx_onnx_add_test(nn_Softmax) +pnnx_onnx_add_test(nn_Softmin) 
+pnnx_onnx_add_test(nn_Softplus) +pnnx_onnx_add_test(nn_Softshrink) +pnnx_onnx_add_test(nn_Softsign) +pnnx_onnx_add_test(nn_Tanh) +pnnx_onnx_add_test(nn_Tanhshrink) pnnx_onnx_add_test(nn_Upsample) pnnx_onnx_add_test(nn_UpsamplingBilinear2d) pnnx_onnx_add_test(nn_UpsamplingNearest2d) @@ -104,8 +126,30 @@ pnnx_onnx_add_test(squeezenet1_1) pnnx_onnx_add_test(swin_t) pnnx_onnx_add_test(vit_b_32) +pnnx_onnx_add_test(Tensor_expand) +pnnx_onnx_add_test(Tensor_permute) +pnnx_onnx_add_test(Tensor_repeat) +pnnx_onnx_add_test(Tensor_reshape) +pnnx_onnx_add_test(Tensor_select) +pnnx_onnx_add_test(Tensor_slice) +pnnx_onnx_add_test(Tensor_view) + +pnnx_onnx_add_test(torch_cat) +pnnx_onnx_add_test(torch_ceil) +pnnx_onnx_add_test(torch_chunk) +pnnx_onnx_add_test(torch_flatten) +pnnx_onnx_add_test(torch_floor) pnnx_onnx_add_test(torch_max) +pnnx_onnx_add_test(torch_maximum) pnnx_onnx_add_test(torch_mean) pnnx_onnx_add_test(torch_min) +pnnx_onnx_add_test(torch_minimum) pnnx_onnx_add_test(torch_prod) +pnnx_onnx_add_test(torch_roll) +pnnx_onnx_add_test(torch_split) +pnnx_onnx_add_test(torch_squeeze) +pnnx_onnx_add_test(torch_stack) pnnx_onnx_add_test(torch_sum) +pnnx_onnx_add_test(torch_transpose) +pnnx_onnx_add_test(torch_unbind) +pnnx_onnx_add_test(torch_unsqueeze) diff --git a/tools/pnnx/tests/onnx/test_F_log_softmax.py b/tools/pnnx/tests/onnx/test_F_log_softmax.py new file mode 100644 index 000000000000..8bc657c67780 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_log_softmax.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.log_softmax(x, 1) + y = F.log_softmax(y, 0) + z = F.log_softmax(z, 2) + w = F.log_softmax(w, 3) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_log_softmax.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_log_softmax.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_log_softmax_pnnx + b = test_F_log_softmax_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_logsigmoid.py b/tools/pnnx/tests/onnx/test_F_logsigmoid.py new file mode 100644 index 000000000000..a731936a1097 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_logsigmoid.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.logsigmoid(x) + y = F.logsigmoid(y) + z = F.logsigmoid(z) + w = F.logsigmoid(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_logsigmoid.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_logsigmoid.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_logsigmoid_pnnx + b = test_F_logsigmoid_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_mish.py b/tools/pnnx/tests/onnx/test_F_mish.py new file mode 100644 index 000000000000..69026d38b2bf --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_mish.py @@ -0,0 +1,76 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +def mish_forward_0(x): + return x * F.softplus(x).tanh() + +def mish_forward_1(x): + return x.mul(torch.tanh(F.softplus(x))) + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.mish(x) + y = F.mish(y) + z = mish_forward_0(z) + w = mish_forward_1(w) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.9'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_mish.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_mish.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_mish_pnnx + b = test_F_mish_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_selu.py b/tools/pnnx/tests/onnx/test_F_selu.py new file mode 100644 index 000000000000..e70f93441912 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_selu.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.selu(x) + y = F.selu(y) + z = F.selu(z) + w = F.selu(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_selu.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_selu.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_selu_pnnx + b = test_F_selu_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_sigmoid.py b/tools/pnnx/tests/onnx/test_F_sigmoid.py index 684a7ab48d9f..c90e570e0057 100644 --- a/tools/pnnx/tests/onnx/test_F_sigmoid.py +++ b/tools/pnnx/tests/onnx/test_F_sigmoid.py @@ -41,7 +41,7 @@ def test(): z = torch.rand(1, 3, 12, 16) w = torch.rand(1, 5, 7, 9, 11) - a0, a1, a2, a3 = net(x, y, z, w) + a = net(x, y, z, w) # export onnx torch.onnx.export(net, (x, y, z, w), "test_F_sigmoid.onnx") @@ -52,9 +52,12 @@ def test(): # pnnx inference import test_F_sigmoid_pnnx - b0, b1, b2, b3 = test_F_sigmoid_pnnx.test_inference() + b = test_F_sigmoid_pnnx.test_inference() - return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) and torch.equal(a3, b3) + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True if __name__ == "__main__": if test(): diff --git a/tools/pnnx/tests/onnx/test_F_silu.py b/tools/pnnx/tests/onnx/test_F_silu.py new file mode 100644 index 000000000000..d6cc987262ea --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_silu.py @@ -0,0 +1,69 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +def silu_forward_0(x): + return x * torch.sigmoid(x) + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.silu(x) + y = F.silu(y) + z = F.silu(z) + w = silu_forward_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_silu.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_silu.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_silu_pnnx + b = test_F_silu_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_softmin.py b/tools/pnnx/tests/onnx/test_F_softmin.py new file mode 100644 index 000000000000..88a82fea00af --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_softmin.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.softmin(x, 1) + y = F.softmin(y, 0) + z = F.softmin(z, 2) + w = F.softmin(w, 3) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_softmin.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_softmin.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_softmin_pnnx + b = test_F_softmin_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_softplus.py b/tools/pnnx/tests/onnx/test_F_softplus.py new file mode 100644 index 000000000000..c261f58d67c4 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_softplus.py @@ -0,0 +1,70 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.softplus(x) + y = F.softplus(y, 2, 5.2) + z = F.softplus(z, -0.7, 15) + w = F.softplus(w, 0.1, 0.3) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.11'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_softplus.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_softplus.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_softplus_pnnx + b = test_F_softplus_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_softshrink.py b/tools/pnnx/tests/onnx/test_F_softshrink.py new file mode 100644 index 000000000000..7f1fb8838077 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_softshrink.py @@ -0,0 +1,70 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.softshrink(x) + y = F.softshrink(y, 0.1) + z = F.softshrink(z, 0.22) + w = F.softshrink(w, 0) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.11'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_softshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_softshrink.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_softshrink_pnnx + b = test_F_softshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_softsign.py b/tools/pnnx/tests/onnx/test_F_softsign.py new file mode 100644 index 000000000000..27164f3dfc17 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_softsign.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.softsign(x) + y = F.softsign(y) + z = F.softsign(z) + w = F.softsign(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_softsign.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_softsign.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_softsign_pnnx + b = test_F_softsign_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_tanh.py b/tools/pnnx/tests/onnx/test_F_tanh.py new file mode 100644 index 000000000000..b56d513f655e --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_tanh.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.tanh(x) + y = F.tanh(y) + z = F.tanh(z) + w = F.tanh(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_tanh.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_tanh.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_tanh_pnnx + b = test_F_tanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_F_tanhshrink.py b/tools/pnnx/tests/onnx/test_F_tanhshrink.py new file mode 100644 index 000000000000..7be2bf57cb16 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_F_tanhshrink.py @@ -0,0 +1,66 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = F.tanhshrink(x) + y = F.tanhshrink(y) + z = F.tanhshrink(z) + w = F.tanhshrink(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(12, 2, 16) + z = torch.rand(1, 3, 12, 16) + w = torch.rand(1, 5, 7, 9, 11) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_F_tanhshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_F_tanhshrink.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]") + + # pnnx inference + import test_F_tanhshrink_pnnx + b = test_F_tanhshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_expand.py b/tools/pnnx/tests/onnx/test_Tensor_expand.py new file mode 100644 index 000000000000..ceb01dac4c81 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_expand.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.expand(24) + y = y.expand(-1, 11, -1) + z = z.expand(2, 8, 3, -1, 4) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1) + y = torch.rand(3, 1, 1) + z = torch.rand(1, 8, 1, 9, 1) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_expand.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_expand.onnx inputshape=[1],[3,1,1],[1,8,1,9,1]") + + # pnnx inference + import test_Tensor_expand_pnnx + b = test_Tensor_expand_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_permute.py b/tools/pnnx/tests/onnx/test_Tensor_permute.py new file mode 100644 index 000000000000..a36de4c251cc --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_permute.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.permute(1, 0, 2) + x = x.permute(0, 1, 2) + y = y.permute(2, 3, 1, 0) + y = y.permute(3, 1, 0, 2) + z = z.permute(1, 3, 0, 4, 2) + z = z.permute(0, 2, 4, 3, 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_permute.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_permute.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_permute_pnnx + b = test_Tensor_permute_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_repeat.py b/tools/pnnx/tests/onnx/test_Tensor_repeat.py new file mode 100644 index 000000000000..569ad548beaf --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_repeat.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.repeat(1, 2, 3) + x = x.repeat(2, 3, 4) + y = y.repeat(1, 2, 1, 4) + y = y.repeat(3, 4, 5, 1) + z = z.repeat(1, 2, 3, 1, 5) + z = z.repeat(2, 3, 3, 1, 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_repeat.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_repeat.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_repeat_pnnx + b = test_Tensor_repeat_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_reshape.py b/tools/pnnx/tests/onnx/test_Tensor_reshape.py new file mode 100644 index 000000000000..027fb40a07d9 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_reshape.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.reshape(1, 2, 24) + x = x.reshape(48) + y = y.reshape(1, 11, 5, 9) + y = y.reshape(99, 5) + z = z.reshape(4, 3, 30, 10, 14) + z = z.reshape(15, 2, 10, 7, 8, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_reshape.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_reshape.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_reshape_pnnx + b = test_Tensor_reshape_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_select.py b/tools/pnnx/tests/onnx/test_Tensor_select.py new file mode 100644 index 000000000000..4f7488b55a52 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_select.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.select(1, 1) + y = y.select(2, 4) + z = z.select(0, 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_select.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_select.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_select_pnnx + b = test_Tensor_select_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_slice.py b/tools/pnnx/tests/onnx/test_Tensor_slice.py new file mode 100644 index 000000000000..7fe32b4af617 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_slice.py @@ -0,0 +1,79 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + if version.parse(torch.__version__) < version.parse('1.12'): + x = x[:,:12,1:14:1] + else: + x = x[:,:12,1:14:2] + x = x[...,1:] + if version.parse(torch.__version__) >= version.parse('1.10'): + x = x[:,:,:x.size(2)-1] + y = y[0:,1:,5:,3:] + if version.parse(torch.__version__) < version.parse('1.12'): + y = y[:,:,1:13:1,:14] + else: + y = y[:,:,1:13:2,:14] + if version.parse(torch.__version__) >= version.parse('1.10'): + y = y[:1,:y.size(1):,:,:] + z = z[4:] + if version.parse(torch.__version__) < version.parse('1.12'): + z = z[:2,:,:,:,2:-2:1] + else: + z = z[:2,:,:,:,2:-2:3] + if version.parse(torch.__version__) >= version.parse('1.10'): + z = z[:,:,:,z.size(3)-3:,:] + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 13, 26) + y = torch.rand(1, 15, 19, 21) + z = torch.rand(14, 18, 15, 19, 20) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_slice.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_slice.onnx inputshape=[1,13,26],[1,15,19,21],[14,18,15,19,20]") + + # pnnx inference + import test_Tensor_slice_pnnx + b = test_Tensor_slice_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_view.py b/tools/pnnx/tests/onnx/test_Tensor_view.py new file mode 100644 index 000000000000..40df090a07bb --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_view.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.view(1, 2, 24) + x = x.view(48) + y = y.view(1, 11, 5, 9) + y = y.view(99, 5) + z = z.view(4, 3, 30, 10, 14) + z = z.view(15, 2, 10, 7, 8, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_view.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_view.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_view_pnnx + b = test_Tensor_view_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py b/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py new file mode 100644 index 000000000000..ddb44cbf4427 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.LogSigmoid() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_LogSigmoid.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_LogSigmoid.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_LogSigmoid_pnnx + b = test_nn_LogSigmoid_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py b/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py new file mode 100644 index 000000000000..dbe8dc96d824 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py @@ -0,0 +1,71 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.LogSoftmax(dim=1) + self.act_1 = nn.LogSoftmax(dim=1) + self.act_2 = nn.LogSoftmax(dim=0) + self.act_3 = nn.LogSoftmax(dim=2) + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_1(y) + z = self.act_2(z) + w = self.act_3(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_LogSoftmax.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_LogSoftmax.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_LogSoftmax_pnnx + b = test_nn_LogSoftmax_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Mish.py b/tools/pnnx/tests/onnx/test_nn_Mish.py new file mode 100644 index 000000000000..481ba7181117 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Mish.py @@ -0,0 +1,72 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Mish() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.9'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Mish.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Mish.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Mish_pnnx + b = test_nn_Mish_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_ReLU.py b/tools/pnnx/tests/onnx/test_nn_ReLU.py index d381fb5bc0e5..8230e3f4827a 100644 --- a/tools/pnnx/tests/onnx/test_nn_ReLU.py +++ b/tools/pnnx/tests/onnx/test_nn_ReLU.py @@ -61,7 +61,7 @@ def test(): if not torch.allclose(a0, b0, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_nn_SELU.py b/tools/pnnx/tests/onnx/test_nn_SELU.py new file mode 100644 index 000000000000..a78c9e2336f3 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_SELU.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.SELU() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_SELU.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_SELU.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_SELU_pnnx + b = test_nn_SELU_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_SiLU.py b/tools/pnnx/tests/onnx/test_nn_SiLU.py new file mode 100644 index 000000000000..e509ddb6754f --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_SiLU.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.SiLU() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_SiLU.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_SiLU.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_SiLU_pnnx + b = test_nn_SiLU_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Sigmoid.py b/tools/pnnx/tests/onnx/test_nn_Sigmoid.py index 5b9cfc9a2bef..72d5d798ef48 100644 --- a/tools/pnnx/tests/onnx/test_nn_Sigmoid.py +++ b/tools/pnnx/tests/onnx/test_nn_Sigmoid.py @@ -43,7 +43,7 @@ def test(): z = torch.rand(1, 12, 24, 64) w = torch.rand(1, 12, 24, 32, 64) - a0, a1, a2, a3 = net(x, y, z, w) + a = net(x, y, z, w) # export onnx torch.onnx.export(net, (x, y, z, w), "test_nn_Sigmoid.onnx") @@ -54,9 +54,12 @@ def test(): # pnnx inference import test_nn_Sigmoid_pnnx - b0, b1, b2, b3 = test_nn_Sigmoid_pnnx.test_inference() + b = test_nn_Sigmoid_pnnx.test_inference() - return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) and torch.equal(a3, b3) + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True if __name__ == "__main__": if test(): diff --git a/tools/pnnx/tests/onnx/test_nn_Softmin.py b/tools/pnnx/tests/onnx/test_nn_Softmin.py new file mode 100644 index 000000000000..9cb8417f2f65 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Softmin.py @@ -0,0 +1,71 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softmin(dim=1) + self.act_1 = nn.Softmin(dim=1) + self.act_2 = nn.Softmin(dim=0) + self.act_3 = nn.Softmin(dim=2) + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_1(y) + z = self.act_2(z) + w = self.act_3(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Softmin.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Softmin.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Softmin_pnnx + b = test_nn_Softmin_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Softplus.py b/tools/pnnx/tests/onnx/test_nn_Softplus.py new file mode 100644 index 000000000000..445c6341b29c --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Softplus.py @@ -0,0 +1,73 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softplus() + self.act_1 = nn.Softplus(beta=0.7, threshold=15) + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_1(z) + w = self.act_1(w) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.11'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Softplus.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Softplus.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Softplus_pnnx + b = test_nn_Softplus_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Softshrink.py b/tools/pnnx/tests/onnx/test_nn_Softshrink.py new file mode 100644 index 000000000000..b86e9239c162 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Softshrink.py @@ -0,0 +1,73 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softshrink() + self.act_1 = nn.Softshrink(lambd=1.3) + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_1(z) + w = self.act_1(w) + return x, y, z, w + +def test(): + if version.parse(torch.__version__) < version.parse('1.11'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Softshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Softshrink.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Softshrink_pnnx + b = test_nn_Softshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Softsign.py b/tools/pnnx/tests/onnx/test_nn_Softsign.py new file mode 100644 index 000000000000..da86752ca671 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Softsign.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softsign() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Softsign.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Softsign.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Softsign_pnnx + b = test_nn_Softsign_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Tanh.py b/tools/pnnx/tests/onnx/test_nn_Tanh.py new file mode 100644 index 000000000000..083275d277f2 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Tanh.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Tanh() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Tanh.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Tanh.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Tanh_pnnx + b = test_nn_Tanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py b/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py new file mode 100644 index 000000000000..20cabe2559a5 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Tanhshrink() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Tanhshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Tanhshrink.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Tanhshrink_pnnx + b = test_nn_Tanhshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_squeezenet1_1.py b/tools/pnnx/tests/onnx/test_squeezenet1_1.py index f5f5f4a668a9..28c7df8fb81e 100644 --- a/tools/pnnx/tests/onnx/test_squeezenet1_1.py +++ b/tools/pnnx/tests/onnx/test_squeezenet1_1.py @@ -39,7 +39,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_swin_t.py b/tools/pnnx/tests/onnx/test_swin_t.py index be25520d0bc4..6361d20c9116 100644 --- a/tools/pnnx/tests/onnx/test_swin_t.py +++ b/tools/pnnx/tests/onnx/test_swin_t.py @@ -43,7 +43,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_torch_cat.py b/tools/pnnx/tests/onnx/test_torch_cat.py new file mode 100644 index 000000000000..0d944434d280 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_cat.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + out0 = torch.cat((x, y), dim=1) + out1 = torch.cat((z, w), dim=3) + out2 = torch.cat((w, w), dim=2) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 2, 16) + z = torch.rand(1, 5, 9, 11) + w = torch.rand(1, 5, 9, 3) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_torch_cat.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_cat.onnx inputshape=[1,3,16],[1,2,16],[1,5,9,11],[1,5,9,3]") + + # pnnx inference + import test_torch_cat_pnnx + b = test_torch_cat_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_ceil.py b/tools/pnnx/tests/onnx/test_torch_ceil.py new file mode 100644 index 000000000000..1ff59b37a485 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_ceil.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.ceil(x * 10) + y = torch.ceil(y * 10) + z = torch.ceil(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_ceil.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_ceil.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_ceil_pnnx + b = test_torch_ceil_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_chunk.py b/tools/pnnx/tests/onnx/test_torch_chunk.py new file mode 100644 index 000000000000..2d1400103b9f --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_chunk.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1 = torch.chunk(x, chunks=2, dim=1) + y0, y1, y2 = torch.chunk(y, chunks=3, dim=2) + z0, z1, z2, z3, z4 = torch.chunk(z, chunks=5, dim=0) + return x0, x1, y0, y1, y2, z0, z1, z2, z3, z4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_chunk.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_chunk.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_chunk_pnnx + b = test_torch_chunk_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_flatten.py b/tools/pnnx/tests/onnx/test_torch_flatten.py new file mode 100644 index 000000000000..6105b106804e --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_flatten.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.flatten(x) + y = torch.flatten(y, start_dim=1, end_dim=-1) + z = torch.flatten(z, start_dim=3, end_dim=4) + x = x.relu() + y = y.relu() + z = z.relu() + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_flatten.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_flatten.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_flatten_pnnx + b = test_torch_flatten_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_floor.py b/tools/pnnx/tests/onnx/test_torch_floor.py new file mode 100644 index 000000000000..a046e4c241ac --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_floor.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.floor(x * 10) + y = torch.floor(y * 10) + z = torch.floor(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_floor.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_floor.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_floor_pnnx + b = test_torch_floor_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_maximum.py b/tools/pnnx/tests/onnx/test_torch_maximum.py new file mode 100644 index 000000000000..5e17d5cb2d2a --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_maximum.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.maximum(x, y) + out1 = torch.maximum(y, y) + out2 = torch.maximum(z, torch.ones_like(z) + 0.1) + return out0, out1, out2 + +def test(): + if version.parse(torch.__version__) < version.parse('1.12'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_maximum.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_maximum.onnx inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_maximum_pnnx + b = test_torch_maximum_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_minimum.py b/tools/pnnx/tests/onnx/test_torch_minimum.py new file mode 100644 index 000000000000..0d8e9a87e50c --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_minimum.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.minimum(x, y) + out1 = torch.minimum(y, y) + out2 = torch.minimum(z, torch.ones_like(z) + 0.1) + return out0, out1, out2 + +def test(): + if version.parse(torch.__version__) < version.parse('1.12'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_minimum.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_minimum.onnx inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_minimum_pnnx + b = test_torch_minimum_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_roll.py b/tools/pnnx/tests/onnx/test_torch_roll.py new file mode 100644 index 000000000000..06b8d579649e --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_roll.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3, -1) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('1.10'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_roll.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_roll.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_roll_pnnx + b = test_torch_roll_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_split.py b/tools/pnnx/tests/onnx/test_torch_split.py new file mode 100644 index 000000000000..b13b041cd96b --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_split.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1 = torch.split(x, split_size_or_sections=2, dim=1) + y0, y1, y2 = torch.split(y, split_size_or_sections=[1,3,5], dim=2) + z0, z1, z2, z3, z4 = torch.split(z, split_size_or_sections=3, dim=0) + return x0, x1, y0, y1, y2, z0, z1, z2, z3, z4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_split.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_split.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_split_pnnx + b = test_torch_split_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_squeeze.py b/tools/pnnx/tests/onnx/test_torch_squeeze.py new file mode 100644 index 000000000000..b29e4ba2f9d7 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_squeeze.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.squeeze(x, 1) + y = torch.squeeze(y) + z = torch.squeeze(z, 4) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 1, 16) + y = torch.rand(1, 5, 1, 11) + z = torch.rand(14, 8, 5, 9, 1) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_squeeze.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_squeeze.onnx inputshape=[1,1,16],[1,5,1,11],[14,8,5,9,1]") + + # pnnx inference + import test_torch_squeeze_pnnx + b = test_torch_squeeze_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_stack.py b/tools/pnnx/tests/onnx/test_torch_stack.py new file mode 100644 index 000000000000..7b04ddd307f5 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_stack.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + out0 = torch.stack((x, y), dim=0) + out1 = torch.stack((x, y), dim=2) + out2 = torch.stack((z, w), dim=2) + out3 = torch.stack((z, w), dim=-1) + return out0, out1, out2, out3 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + w = torch.rand(5, 9, 3) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_torch_stack.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_stack.onnx inputshape=[3,16],[3,16],[5,9,3],[5,9,3]") + + # pnnx inference + import test_torch_stack_pnnx + b = test_torch_stack_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_transpose.py b/tools/pnnx/tests/onnx/test_torch_transpose.py new file mode 100644 index 000000000000..e6a25c441017 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_transpose.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.transpose(x, 1, 2) + y = torch.transpose(y, 2, 3) + z = torch.transpose(z, 1, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_transpose.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_transpose.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_transpose_pnnx + b = test_torch_transpose_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_unbind.py b/tools/pnnx/tests/onnx/test_torch_unbind.py new file mode 100644 index 000000000000..a98fa25c51cc --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_unbind.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1, x2 = torch.unbind(x, dim=1) + y0, y1, y2, y3, y4, y5, y6, y7, y8 = torch.unbind(y, dim=2) + z0, z1, z2, z3 = torch.unbind(z, dim=0) + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(4, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_unbind.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_unbind.onnx inputshape=[1,3,16],[1,5,9,11],[4,8,5,9,10]") + + # pnnx inference + import test_torch_unbind_pnnx + b = test_torch_unbind_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_unsqueeze.py b/tools/pnnx/tests/onnx/test_torch_unsqueeze.py new file mode 100644 index 000000000000..01bf84076cf3 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_unsqueeze.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.unsqueeze(x, 0) + x = torch.unsqueeze(x, 1) + y = torch.unsqueeze(y, 2) + y = torch.unsqueeze(y, -1) + z = torch.unsqueeze(z, -2) + z = torch.unsqueeze(z, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_unsqueeze.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_unsqueeze.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_unsqueeze_pnnx + b = test_torch_unsqueeze_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_vit_b_32.py b/tools/pnnx/tests/onnx/test_vit_b_32.py index ecb0bd350f62..3c92a119406a 100644 --- a/tools/pnnx/tests/onnx/test_vit_b_32.py +++ b/tools/pnnx/tests/onnx/test_vit_b_32.py @@ -46,7 +46,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/test_F_rms_norm.py b/tools/pnnx/tests/test_F_rms_norm.py new file mode 100644 index 000000000000..5dd9e699b23f --- /dev/null +++ b/tools/pnnx/tests/test_F_rms_norm.py @@ -0,0 +1,77 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.w3 = nn.Parameter(torch.rand(24)) + self.w4 = nn.Parameter(torch.rand(12, 16)) + self.w5 = nn.Parameter(torch.rand(24)) + + def forward(self, x, y, z, w0, w1, w2): + x = F.rms_norm(x, (24,), w0) + x = F.rms_norm(x, (12,24), None) + x = F.rms_norm(x, (24,), self.w3) + + y = F.rms_norm(y, (16,), None, eps=1e-3) + y = F.rms_norm(y, (12,16), w1) + y = F.rms_norm(y, (12,16), self.w4) + + z = F.rms_norm(z, (24,), w2) + z = F.rms_norm(z, (12,16,24), None, eps=1e-2) + z = F.rms_norm(z, (24,), self.w5) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24) + y = torch.rand(2, 3, 12, 16) + z = torch.rand(1, 10, 12, 16, 24) + w0 = torch.rand(24) + w1 = torch.rand(12, 16) + w2 = torch.rand(24) + + a0, a1, a2 = net(x, y, z, w0, w1, w2) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z, w0, w1, w2)) + mod.save("test_F_rms_norm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_F_rms_norm.pt inputshape=[1,12,24],[2,3,12,16],[1,10,12,16,24],[24],[12,16],[24]") + + # pnnx inference + import test_F_rms_norm_pnnx + b0, b1, b2 = test_F_rms_norm_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_RMSNorm.py b/tools/pnnx/tests/test_nn_RMSNorm.py new file mode 100644 index 000000000000..a9b70cdb2661 --- /dev/null +++ b/tools/pnnx/tests/test_nn_RMSNorm.py @@ -0,0 +1,71 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.rmsn_0 = nn.RMSNorm(64) + self.rmsn_0.weight = nn.Parameter(torch.rand(64)) + self.rmsn_1 = nn.RMSNorm(normalized_shape=(24,64), eps=1e-2, elementwise_affine=False) + + def forward(self, x, y, z): + x = self.rmsn_0(x) + x = self.rmsn_1(x) + + y = self.rmsn_0(y) + y = self.rmsn_1(y) + + z = self.rmsn_0(z) + z = self.rmsn_1(z) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 24, 64) + y = torch.rand(1, 12, 24, 64) + z = torch.rand(1, 12, 16, 24, 64) + + a0, a1, a2 = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_RMSNorm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_nn_RMSNorm.pt inputshape=[1,24,64],[1,12,24,64],[1,12,16,24,64]") + + # pnnx inference + import test_nn_RMSNorm_pnnx + b0, b1, b2 = test_nn_RMSNorm_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py b/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py new file mode 100644 index 000000000000..b04fa93442fa --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py @@ -0,0 +1,77 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class T5LayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.rand(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + variance = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * x + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.rmsn_0 = T5LayerNorm(26) + self.rmsn_1 = T5LayerNorm(21) + + def forward(self, x, y): + x = self.rmsn_0(x) + y = self.rmsn_1(y) + return x, y + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 64, 26) + y = torch.rand(3, 15, 15, 21) + + a0, a1 = net(x, y) + + # export onnx + torch.onnx.export(net, (x,y), "test.onnx") + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_pnnx_fuse_rmsnorm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_fuse_rmsnorm.pt inputshape=[1,64,26],[3,15,15,21]") + + # pnnx inference + import test_pnnx_fuse_rmsnorm_pnnx + b0, b1 = test_pnnx_fuse_rmsnorm_pnnx.test_inference() + + return torch.allclose(a0, b0, 1e-4, 1e-4) and torch.allclose(a1, b1, 1e-4, 1e-4) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_logaddexp.py b/tools/pnnx/tests/test_torch_logaddexp.py new file mode 100644 index 000000000000..6914dbd62131 --- /dev/null +++ b/tools/pnnx/tests/test_torch_logaddexp.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.logaddexp(x, y) + out1 = torch.logaddexp(y, y) + out2 = torch.logaddexp(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_logaddexp.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_logaddexp.pt inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_logaddexp_pnnx + b = test_torch_logaddexp_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_roll.py b/tools/pnnx/tests/test_torch_roll.py new file mode 100644 index 000000000000..32e3bde38e13 --- /dev/null +++ b/tools/pnnx/tests/test_torch_roll.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_roll.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_roll.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_roll_pnnx + b = test_torch_roll_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_unbind.py b/tools/pnnx/tests/test_torch_unbind.py index c92c87b74351..b232f289dab4 100644 --- a/tools/pnnx/tests/test_torch_unbind.py +++ b/tools/pnnx/tests/test_torch_unbind.py @@ -24,7 +24,7 @@ def forward(self, x, y, z): x0, x1, x2 = torch.unbind(x, dim=1) y0, y1, y2, y3, y4, y5, y6, y7, y8 = torch.unbind(y, dim=2) z0, z1, z2, z3 = torch.unbind(z, dim=0) - return x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 def test(): net = Model() diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 4d19ceb6f166..5e92b333aa57 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -133,6 +133,8 @@ class NetQuantize : public ModelWriter int quantize_lstm(); int quantize_gru(); + int quantize_embed(); + int fuse_requantize(); }; @@ -562,6 +564,55 @@ int NetQuantize::quantize_gru() return 0; } +int NetQuantize::quantize_embed() +{ + for (size_t i = 0; i < layers.size(); i++) + { + if (layers[i]->type != "Embed") + continue; + + // Embed - quantize weight from fp32 to int8 + ncnn::Embed* embed = (ncnn::Embed*)layers[i]; + + fprintf(stderr, "quantize_embed %s\n", embed->name.c_str()); + + // TODO move to ncnn2table + + const int num_output = embed->num_output; + const int input_dim = embed->input_dim; + + ncnn::Mat weight_data_int8_scales(1); + { + const float* ptr = embed->weight_data; + float absmax = 0.f; + for (int i = 0; i < embed->weight_data.w; i++) + { + absmax = std::max(absmax, (float)fabs(ptr[i])); + } + + weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax; + } + + { + ncnn::Mat weight_data_int8; + + ncnn::Option opt_q = opt; + opt_q.blob_allocator = embed->weight_data.allocator; + opt_q.use_packing_layout = false; + ncnn::quantize_to_int8(embed->weight_data, weight_data_int8, weight_data_int8_scales, opt_q); + if (weight_data_int8.empty()) + return -100; + + embed->weight_data = weight_data_int8; + } + + embed->int8_scale_term = 2; + embed->weight_data_int8_scale = weight_data_int8_scales[0]; + } + + return 0; +} + int NetQuantize::fuse_requantize() { const size_t layer_count = layers.size(); @@ -809,6 +860,7 @@ int main(int argc, char** argv) quantizer.quantize_rnn(); quantizer.quantize_lstm(); quantizer.quantize_gru(); + quantizer.quantize_embed(); quantizer.fuse_requantize();