Probe ARM compiler features with MSVC-style frontends as well.

Before this patch, the compile checks in the 32-bit (CMAKE_SIZEOF_VOID_P EQUAL 4)
and 64-bit (CMAKE_SIZEOF_VOID_P EQUAL 8) ARM branches were skipped whenever the
compiler was MSVC or Clang with the MSVC frontend variant. After this patch the
checks run in both cases: MSVC-style frontends are probed with /arch:... flags,
every other compiler keeps the existing -mfpu=... / -march=... flags. Each
branch unsets CMAKE_REQUIRED_FLAGS after its last check so the probe flags do
not leak into later checks.

NOTE(review): line breaks, indentation (4 spaces per nesting level) and the
header names in the probe sources (arm_neon.h / arm_sve.h) were reconstructed
from a whitespace-collapsed copy of this patch - confirm against the target
file before applying.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 641f98471d0..1946b4bdbf8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -153,16 +153,23 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
     OR ((CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) AND (${CMAKE_GENERATOR_PLATFORM} MATCHES "^(arm|arm64)")))
     set(NCNN_TARGET_ARCH arm)
 
-    if(CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")))
-        set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
+    if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
+            set(CMAKE_REQUIRED_FLAGS "/arch:VFPv4")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
 
-        if(NOT NCNN_COMPILER_SUPPORT_ARM_VFPV4)
-            set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4 -mfp16-format=ieee")
-            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16)
-        endif()
+            unset(CMAKE_REQUIRED_FLAGS)
+        else()
+            set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
 
-        unset(CMAKE_REQUIRED_FLAGS)
+            if(NOT NCNN_COMPILER_SUPPORT_ARM_VFPV4)
+                set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4 -mfp16-format=ieee")
+                check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16)
+            endif()
+
+            unset(CMAKE_REQUIRED_FLAGS)
+        endif()
 
         if(NCNN_COMPILER_SUPPORT_ARM_VFPV4 OR NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16)
             option(NCNN_VFPV4 "optimize armv7 platform with vfpv4" ON)
@@ -171,41 +178,78 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
         endif()
     endif()
 
-    if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")))
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8-a")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+dotprod")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16fml")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve")
-        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve2")
-        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+bf16")
-        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+i8mm")
-        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
 
-        set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+f32mm")
-        check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
+            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
 
-        unset(CMAKE_REQUIRED_FLAGS)
+            unset(CMAKE_REQUIRED_FLAGS)
+        else()
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8-a")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _a; float16x4_t _s = vcvt_f16_f32(_a); return 0; }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16x8_t _s, _a, _b; _s = vfmaq_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+dotprod")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vdotq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16fml")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm")
+            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat16_t _s, _a, _b; svbool_t bp; _s = svmla_f16_z(bp, _s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve2")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint16_t _s; svint8_t _a, _b; _s = svmlslb_s16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+bf16")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s; svbfloat16_t _a, _b; _s = svbfmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+i8mm")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svint32_t _s; svint8_t _a, _b; _s = svmmla_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
+
+            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+f32mm")
+            check_cxx_source_compiles("#include <arm_sve.h>\nint main() { svfloat32_t _s, _a, _b; _s = svmmla_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
+
+            unset(CMAKE_REQUIRED_FLAGS)
+        endif()
 
         if(NCNN_COMPILER_SUPPORT_ARM_VFPV4)
             option(NCNN_VFPV4 "optimize aarch64 platform with vfpv4" ON)