Skip to content

Commit

Permalink
feat mask for disable threading, make some extractor setter no-op, up…
Browse files Browse the repository at this point in the history
…date doc (#5270)
  • Loading branch information
nihui authored Jan 8, 2024
1 parent 237f45f commit c222208
Show file tree
Hide file tree
Showing 10 changed files with 150 additions and 190 deletions.
26 changes: 4 additions & 22 deletions build-android.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -2,56 +2,38 @@
@ECHO OFF
@SETLOCAL
@SET ANDROID_NDK=<your-ndk-root_path, such as"E:\android-ndk-r18b">
@SET VULKAN_SDK=<your-vulkan-toolkit_path, such as"D:\VulkanSDK\1.1.106.0\Bin">

:: Set ninja.exe
:: @SET NINJA_EXE=<your-ninja-exe_path, such as"D:\android\sdk\cmake\3.10.2.4988404\bin\ninja.exe">

:: android armv7
mkdir build-android-armv7
pushd build-android-armv7
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
:: cmake -G Ninja -DCMAKE_TOOLCHAIN_FILE="%ANDROID_NDK%/build/cmake/android.toolchain.cmake" -DCMAKE_MAKE_PROGRAM=%NINJA_EXE% -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android armv7 vulkan
mkdir build-android-armv7-vulkan
pushd build-android-armv7-vulkan
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android aarch64
mkdir build-android-aarch64
pushd build-android-aarch64
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android aarch64 vulkan
mkdir build-android-aarch64-vulkan
pushd build-android-aarch64-vulkan
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android x86
mkdir build-android-x86
pushd build-android-x86
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 ..
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android x86_64
mkdir build-android-x86_64
pushd build-android-x86_64
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 ..
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd
Expand Down
120 changes: 12 additions & 108 deletions build.sh
Original file line number Diff line number Diff line change
@@ -1,73 +1,41 @@
#!/usr/bin/env bash

##### android armv7 without neon
mkdir -p build-android-armv7-without-neon
pushd build-android-armv7-without-neon
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android armv7
mkdir -p build-android-armv7
pushd build-android-armv7
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 ..
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android aarch64
mkdir -p build-android-aarch64
pushd build-android-aarch64
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 ..
make -j4
make install
popd

##### android armv7 without neon
mkdir -p build-android-armv7-without-neon
pushd build-android-armv7-without-neon
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-19 ..
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android x86
mkdir -p build-android-x86
pushd build-android-x86
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 ..
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android x86_64
mkdir -p build-android-x86_64
pushd build-android-x86_64
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 ..
make -j4
make install
popd

##### android armv7 vulkan
mkdir -p build-android-armv7-vulkan
pushd build-android-armv7-vulkan
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android aarch64 vulkan
mkdir -p build-android-aarch64-vulkan
pushd build-android-aarch64-vulkan
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android x86 vulkan
mkdir -p build-android-x86-vulkan
pushd build-android-x86-vulkan
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android x86_64 vulkan
mkdir -p build-android-x86_64-vulkan
pushd build-android-x86_64-vulkan
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
make -j4
make install
popd
Expand Down Expand Up @@ -144,70 +112,6 @@ make -j4
make install
popd

##### ios armv7 arm64
mkdir -p build-ios
pushd build-ios
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc.toolchain.cmake -DENABLE_BITCODE=OFF ..
make -j4
make install
popd

##### ios armv7 arm64 bitcode
mkdir -p build-ios-bitcode
pushd build-ios-bitcode
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc.toolchain.cmake -DENABLE_BITCODE=ON ..
make -j4
make install
popd

##### ios simulator i386 x86_64
mkdir -p build-ios-sim
pushd build-ios-sim
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc.toolchain.cmake -DENABLE_BITCODE=OFF ..
make -j4
make install
popd

##### ios simulator i386 x86_64 bitcode
mkdir -p build-ios-sim-bitcode
pushd build-ios-sim-bitcode
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc.toolchain.cmake -DENABLE_BITCODE=ON ..
make -j4
make install
popd

##### ios arm64 vulkan
mkdir -p build-ios-vulkan
pushd build-ios-vulkan
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc-arm64.toolchain.cmake -DENABLE_BITCODE=OFF -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### ios arm64 vulkan bitcode
mkdir -p build-ios-vulkan-bitcode
pushd build-ios-vulkan-bitcode
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc-arm64.toolchain.cmake -DENABLE_BITCODE=ON -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### ios simulator x86_64 vulkan
mkdir -p build-ios-sim-vulkan
pushd build-ios-sim-vulkan
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc-x64.toolchain.cmake -DENABLE_BITCODE=OFF -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
make
make install
popd

##### ios simulator x86_64 vulkan bitcode
mkdir -p build-ios-sim-vulkan-bitcode
pushd build-ios-sim-vulkan-bitcode
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc-x64.toolchain.cmake -DENABLE_BITCODE=ON -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### MacOS
mkdir -p build-mac
pushd build-mac
Expand Down
2 changes: 0 additions & 2 deletions docs/Home.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ int main()
net.load_model("model.bin");

ncnn::Extractor ex = net.create_extractor();
ex.set_light_mode(true);
ex.set_num_threads(4);

ex.input("data", in);

Expand Down
111 changes: 111 additions & 0 deletions docs/developer-guide/layer-feat-mask.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# layer feature mask

Each ncnn layer allows a special parameter pair `31=X` to control specific behavior.

X is an unsigned integer with each bit contributing a feature mask.

We usually use it to configure fine-grained behaviors for certain layers to maintain accuracy, reduce memory usage, or optimize performance.

|bit|value|mask|rationale|
|---|---|---|---|
|1<<0|1|no fp16 arithmetic|precision concern|
|1<<1|2|no fp16 storage|precision concern|
|1<<2|4|no bf16 storage|precision concern|
|1<<3|8|no int8|debug dynamic quantized model|
|1<<4|16|no vulkan|reduce overhead for cpu op - gpu split - cpu op|
|1<<5|32|no sgemm|reduce some memory|
|1<<6|64|no winograd|reduce some memory|
|1<<7|128|no threading|force single thread|

These bits can be OR-combined into one value to control multiple behaviors simultaneously.

For example, `31=17` means disabling both vulkan and fp16 arithmetic.

## disable fp16 for certain layer to fix overflow

```ruby
7767517
3 3
Input input 0 1 input0 0=22 1=22 2=32
Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1
```

Typically, we use fp16 computation to improve inference speed.
However, since the weight values of `conv1` are very large, fp16 accumulation may cause numerical overflow, so fp16 needs to be disabled individually for `conv1`, while other layers continue to use fp16 mode.

Add `31=3` to disable fp16 storage and arithmetic.

```ruby
7767517
3 3
Input input 0 1 input0 0=22 1=22 2=32
Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=3
```

## disable vulkan for certain layer to improve performance

```ruby
7767517
5 5
Input input 0 1 input0 0=22 1=22 2=32
Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
SomeCPULayer c0 1 1 conv0 c0 0=32
ReLU relu0 1 1 c0 relu0
SomeCPULayer c1 1 1 relu0 c1 0=32
```

Between the CPU layers, there is a simple calculation layer that supports vulkan. We can set `31=16` to force it to run on CPU. This can avoid the overhead of data upload, download and storage layout conversion between CPU and GPU. After all, CPU is fast enough for simple operations.

```ruby
7767517
5 5
Input input 0 1 input0 0=22 1=22 2=32
Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
SomeCPULayer c0 1 1 conv0 c0 0=32
ReLU relu0 1 1 c0 relu0 31=16
SomeCPULayer c1 1 1 relu0 c1 0=32
```

## disable winograd for certain layer to reduce memory usage

```ruby
7767517
3 3
Input input 0 1 input0 0=22 1=22 2=32
Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1
```

The winograd technology uses more memory for the purpose of improving convolution performance, but the performance gain is not always realized. In some memory-constrained situations, or when memory IO is the bottleneck, we can disable the use of winograd on some layers in exchange for a smaller memory footprint. Add `31=64` to Convolution layer, which forces it to use implicit-gemm or tiled im2col-gemm implementation, reducing memory usage and sometimes improving vulkan performance.

```ruby
7767517
3 3
Input input 0 1 input0 0=22 1=22 2=32
Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=64
```

## disable threading for certain layer to improve performance

```ruby
7767517
4 4
Input input 0 1 input0 0=22 1=22 2=3
Convolution conv0 1 1 input0 conv0 0=16 1=3 6=432
HardSigmoid hs 1 1 conv0 hs0
Convolution conv1 1 1 hs0 conv1 0=16 1=3 6=2304
```

The overhead of multi-thread dispatch and merging is too large for small tensors. Add `31=128` to HardSigmoid layer, which forces it to execute in a single thread, reducing power consumption and improving performance.

```ruby
7767517
4 4
Input input 0 1 input0 0=22 1=22 2=3
Convolution conv0 1 1 input0 conv0 0=16 1=3 6=432
HardSigmoid hs 1 1 conv0 hs0 31=128
Convolution conv1 1 1 hs0 conv1 0=16 1=3 6=2304
```
Loading

0 comments on commit c222208

Please sign in to comment.