From 8293ce0f5f87a15583a132189c1e55c2512e801e Mon Sep 17 00:00:00 2001 From: halx99 Date: Sat, 3 Aug 2024 20:18:13 +0800 Subject: [PATCH] Refactor math simd --- 1k/1kiss.ps1 | 1 + 1k/fetch.cmake | 8 +- 3rdparty/README.md | 2 +- 3rdparty/yasio/yasio/bindings/yasio_ni.cpp | 24 +- 3rdparty/yasio/yasio/config.hpp | 2 +- .../yasio/impl/eventfd_select_interrupter.hpp | 10 +- 3rdparty/yasio/yasio/logging.hpp | 3 + 3rdparty/yasio/yasio/xxsocket.cpp | 2 +- CMakeOptions.md | 1 + cmake/Modules/AXConfigDefine.cmake | 15 +- core/CMakeLists.txt | 40 +- core/base/Configuration.cpp | 14 +- core/base/Console.cpp | 2 +- core/math/Mat4.cpp | 41 +- core/math/Mat4.h | 12 +- core/math/Mat4.inl | 3 +- core/math/MathBase.h | 40 +- core/math/MathUtil.cpp | 232 +++--- core/math/MathUtil.h | 30 +- core/math/MathUtil.inl | 382 +++++----- core/math/MathUtilNeon.inl | 684 +++++++++--------- core/math/MathUtilNeon64.inl | 398 ---------- core/math/MathUtilSSE.inl | 413 +++++++---- core/platform/PlatformConfig.h | 21 + core/platform/PlatformMacros.h | 135 ++-- .../Source/core/math/MathUtilTests.cpp | 345 ++++----- 26 files changed, 1281 insertions(+), 1579 deletions(-) delete mode 100644 core/math/MathUtilNeon64.inl diff --git a/1k/1kiss.ps1 b/1k/1kiss.ps1 index 1e9009412b4b..8d98e9f5ebd3 100644 --- a/1k/1kiss.ps1 +++ b/1k/1kiss.ps1 @@ -895,6 +895,7 @@ function setup_cmake($skipOS = $false, $scope = 'local') { else { & "$cmake_pkg_path" '--skip-license' '--prefix=/usr/local' 1>$null 2>$null } + if (!$?) { Remove-Item $cmake_pkg_path -Force } } $cmake_prog, $_ = find_prog -name 'cmake' -path $cmake_bin -silent $true diff --git a/1k/fetch.cmake b/1k/fetch.cmake index 4556042af33e..6fa0c0804fde 100644 --- a/1k/fetch.cmake +++ b/1k/fetch.cmake @@ -20,10 +20,16 @@ function(_1kfetch_init) set(_1kfetch_manifest "${_1kfetch_manifest}" CACHE STRING "" FORCE) endif() + if(NOT EXISTS ${PWSH_PROG}) # try again + unset(PWSH_PROG CACHE) + find_program(PWSH_PROG NAMES pwsh powershell NO_PACKAGE_ROOT_PATH NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH NO_CMAKE_SYSTEM_PATH NO_CMAKE_FIND_ROOT_PATH) + endif() + execute_process(COMMAND ${PWSH_PROG} ${CMAKE_CURRENT_FUNCTION_LIST_DIR}/resolv-uri.ps1 -name "1kdist" -manifest ${_1kfetch_manifest} OUTPUT_VARIABLE _1kdist_url + RESULT_VARIABLE _1kdist_error ) if(_1kdist_url) @@ -33,7 +39,7 @@ function(_1kfetch_init) set(_1kdist_base_url "${_1kdist_base_url}/${_1kdist_ver}" PARENT_SCOPE) set(_1kdist_ver ${_1kdist_ver} PARENT_SCOPE) else() - message(WARNING "Resolve 1kdist uri fail, the _1kfetch_dist will not work") + message(WARNING "Resolve 1kdist uri fail, ${_1kdist_error}, the _1kfetch_dist will not work") endif() endfunction() diff --git a/3rdparty/README.md b/3rdparty/README.md index 7cdfd34e53d8..bb9df631800b 100644 --- a/3rdparty/README.md +++ b/3rdparty/README.md @@ -248,7 +248,7 @@ ## yasio - [![Upstream](https://img.shields.io/github/v/release/yasio/yasio?label=Upstream)](https://github.com/yasio/yasio) -- Version: 4.2.3 +- Version: 4.2.4 - License: MIT WITH Anti-996 ## zlib diff --git a/3rdparty/yasio/yasio/bindings/yasio_ni.cpp b/3rdparty/yasio/yasio/bindings/yasio_ni.cpp index 8914488f6b06..50b1bab7239d 100644 --- a/3rdparty/yasio/yasio/bindings/yasio_ni.cpp +++ b/3rdparty/yasio/yasio/bindings/yasio_ni.cpp @@ -60,14 +60,14 @@ YASIO_NI_API void yasio_init_globals(void(YASIO_INTEROP_DECL* pfn)(int level, co YASIO_NI_API void yasio_cleanup_globals() { io_service::cleanup_globals(); } struct yasio_io_event { - int kind; // - int channel; - void* thandle; + int kind; // event kind + int channel; // channel index + void* thandle; // transport union { - void* msg; - int status; // + void* hmsg; // io_packet* + int ec; // error code }; - void* user; + void* user; // user data }; YASIO_NI_API void* yasio_create_service(int channel_count, void(YASIO_INTEROP_DECL* event_cb)(yasio_io_event* event), void* user) @@ -82,9 +82,9 @@ YASIO_NI_API void* yasio_create_service(int channel_count, void(YASIO_INTEROP_DE event.thandle = e->transport(); event.user = user; if (event.kind == yasio::YEK_ON_PACKET) - event.msg = !is_packet_empty(pkt) ? &pkt : nullptr; + event.hmsg = !is_packet_empty(pkt) ? &pkt : nullptr; else - event.status = e->status(); + event.ec = e->status(); event_cb(&event); }); return service; @@ -157,8 +157,12 @@ YASIO_NI_API void yasio_set_option(void* service_ptr, int opt, const char* pszAr std::array args; int argc = 0; yasio::split_if(&strArgs.front(), ';', [&](char* s, char* e) { - *e = '\0'; // to c style string - args[argc++] = cxx17::string_view(s, e - s); + if (e) { + *e = '\0'; // to c style string + args[argc++] = cxx17::string_view(s, e - s); + } else { + args[argc++] = cxx17::string_view{s}; + } return (argc < YASIO_MAX_OPTION_ARGC); }); diff --git a/3rdparty/yasio/yasio/config.hpp b/3rdparty/yasio/yasio/config.hpp index 77c35c1301bf..6ae79af5b2ec 100644 --- a/3rdparty/yasio/yasio/config.hpp +++ b/3rdparty/yasio/yasio/config.hpp @@ -205,7 +205,7 @@ SOFTWARE. /* ** The yasio version macros */ -#define YASIO_VERSION_NUM 0x040203 +#define YASIO_VERSION_NUM 0x040204 /* ** The macros used by io_service. diff --git a/3rdparty/yasio/yasio/impl/eventfd_select_interrupter.hpp b/3rdparty/yasio/yasio/impl/eventfd_select_interrupter.hpp index eb5fe285a68a..55d32953da48 100644 --- a/3rdparty/yasio/yasio/impl/eventfd_select_interrupter.hpp +++ b/3rdparty/yasio/yasio/impl/eventfd_select_interrupter.hpp @@ -20,11 +20,11 @@ #include #include #include -#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8) -# include -#else // __GLIBC__ == 2 && __GLIBC_MINOR__ < 8 +#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8) && !defined(__UCLIBC__) +# include // for syscall without API: eventfd +#else # include -#endif // __GLIBC__ == 2 && __GLIBC_MINOR__ < 8 +#endif #include @@ -105,7 +105,7 @@ class eventfd_select_interrupter { // Open the descriptors. Throws on error. inline void open_descriptors() { -#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8) +#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8) && !defined(__UCLIBC__) write_descriptor_ = read_descriptor_ = syscall(__NR_eventfd, 0); if (read_descriptor_ != -1) { diff --git a/3rdparty/yasio/yasio/logging.hpp b/3rdparty/yasio/yasio/logging.hpp index 12b79b94f0d3..11f28046195c 100644 --- a/3rdparty/yasio/yasio/logging.hpp +++ b/3rdparty/yasio/yasio/logging.hpp @@ -40,6 +40,9 @@ inline void yasio__print(std::string&& message) { ::write(::fileno(stdout), mess # include # include # define YASIO_LOG_TAG(tag, format, ...) __android_log_print(ANDROID_LOG_INFO, "yasio", (tag format), ##__VA_ARGS__) +#elif defined(__OHOS__) +# include +# define YASIO_LOG_TAG(tag, format, ...) OH_LOG_INFO(LOG_APP, (tag format "\n"), ##__VA_ARGS__) #else # define YASIO_LOG_TAG(tag, format, ...) printf((tag format "\n"), ##__VA_ARGS__) #endif diff --git a/3rdparty/yasio/yasio/xxsocket.cpp b/3rdparty/yasio/yasio/xxsocket.cpp index 947757846ad6..6c1e06fb6122 100644 --- a/3rdparty/yasio/yasio/xxsocket.cpp +++ b/3rdparty/yasio/yasio/xxsocket.cpp @@ -209,7 +209,7 @@ int xxsocket::pserve(const endpoint& ep) if (!this->reopen(ep.af())) return -1; - set_optval(SOL_SOCKET, SO_REUSEADDR, 1); + this->reuse_address(true); int n = this->bind(ep); if (n != 0) diff --git a/CMakeOptions.md b/CMakeOptions.md index 484c69a2f19c..89135c7d4a94 100644 --- a/CMakeOptions.md +++ b/CMakeOptions.md @@ -52,6 +52,7 @@ default is: `navigator.hardwareConcurrency` - AX_WASM_SHELL_FILE: specify the wasm shell file, by default use `${_AX_ROOT}/core/platform/wasm/shell_minimal.html` - AX_WASM_ENABLE_DEVTOOLS: whether enable web devtools aka `pause`, `resume`, `step` buttons in webpage, default: `TRUE` - AX_WASM_INITIAL_MEMORY: set the wasm initial memory size, default `1024MB` +- AX_WASM_ISA_SIMD: specify the wasm simd intrinsics type, default `none`, supports `sse`, `neon`, note the `wasm-simd` not support by axmol yet ## The options for axmol apps diff --git a/cmake/Modules/AXConfigDefine.cmake b/cmake/Modules/AXConfigDefine.cmake index e2156b202e53..6e3e569e8913 100644 --- a/cmake/Modules/AXConfigDefine.cmake +++ b/cmake/Modules/AXConfigDefine.cmake @@ -186,22 +186,21 @@ endfunction() if(EMSCRIPTEN) set(AX_WASM_THREADS "4" CACHE STRING "Wasm threads count") - - set(_AX_WASM_THREADS_INT 0) + set(_threads_hint "") if (AX_WASM_THREADS STREQUAL "auto") # not empty string or not 0 # Enable pthread support globally + set(_threads_hint "(auto)") include(ProcessorCount) + set(_AX_WASM_THREADS_INT 0) ProcessorCount(_AX_WASM_THREADS_INT) - elseif(AX_WASM_THREADS MATCHES "^([0-9]+)$" OR AX_WASM_THREADS STREQUAL "navigator.hardwareConcurrency") - set(_AX_WASM_THREADS_INT ${AX_WASM_THREADS}) + set(AX_WASM_THREADS "${_AX_WASM_THREADS_INT}" CACHE STRING "Wasm threads count" FORCE) endif() - message(STATUS "AX_WASM_THREADS=${AX_WASM_THREADS}") - message(STATUS "_AX_WASM_THREADS_INT=${_AX_WASM_THREADS_INT}") + message(STATUS "AX_WASM_THREADS=${AX_WASM_THREADS}${_threads_hint}") - if (_AX_WASM_THREADS_INT) + if(AX_WASM_THREADS MATCHES "^([0-9]+)$" OR AX_WASM_THREADS STREQUAL "navigator.hardwareConcurrency") list(APPEND _ax_compile_options -pthread) - add_link_options(-pthread -sPTHREAD_POOL_SIZE=${_AX_WASM_THREADS_INT}) + add_link_options(-pthread -sPTHREAD_POOL_SIZE=${AX_WASM_THREADS}) endif() set(AX_WASM_INITIAL_MEMORY "1024MB" CACHE STRING "") diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 96ef16b5f77f..7defad5ae35c 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -400,9 +400,43 @@ if(WINDOWS) endif() endif() -# AX_USE_SSE -if (AX_ISA_SIMD MATCHES "sse") - target_compile_definitions(${_AX_CORE_LIB} PUBLIC AX_USE_SSE=1) +# axmol math simd intrinsics support +set(_simdc_defines) +set(_simdc_options) +if (NOT WASM) # native platforms auto detect from cmake or preprocessor check + if (AX_ISA_SIMD MATCHES "sse") + list(APPEND _simdc_defines AX_SSE_INTRINSICS=1) + if (AX_ISA_SIMD MATCHES "sse4") + list(APPEND _simdc_defines __SSE4_1__=1) + if (LINUX) + list(APPEND _simdc_options -msse4.1) + endif() + endif() + endif() +else() # wasm requires user specify SIMD intrinsics manually + set(AX_WASM_ISA_SIMD "none" CACHE STRING "") + string(TOLOWER ${AX_WASM_ISA_SIMD} AX_WASM_ISA_SIMD) + if(AX_WASM_ISA_SIMD MATCHES "sse") + message(AUTHOR_WARNING "Using SSE intrinsics for WASM ...") + list(APPEND _simdc_defines AX_SSE_INTRINSICS=1 __SSE__=1 __SSE2__=1) + list(APPEND _simdc_options -msse -msse2) + if(AX_ISA_LEVEL GREATER_EQUAL 2) + list(APPEND _simdc_defines __SSE4_1__=1) + list(APPEND _simdc_options -msse4.1) + endif() + list(APPEND _simdc_options -msimd128) + elseif(AX_WASM_ISA_SIMD MATCHES "neon") + message(AUTHOR_WARNING "Using NEON intrinsics for WASM ...") + list(APPEND _simdc_defines AX_NEON_INTRINSICS=1) + list(APPEND _simdc_options -mfpu=neon -msimd128) + endif() +endif() + +if(_simdc_defines) + target_compile_definitions(${_AX_CORE_LIB} PUBLIC ${_simdc_defines}) + if(_simdc_options) + target_compile_options(${_AX_CORE_LIB} PUBLIC ${_simdc_options}) + endif() endif() # engine extensions diff --git a/core/base/Configuration.cpp b/core/base/Configuration.cpp index aa442d4ff543..8fe50d291f9f 100644 --- a/core/base/Configuration.cpp +++ b/core/base/Configuration.cpp @@ -71,7 +71,7 @@ bool Configuration::init() #if AX_ENABLE_PROFILERS _valueDict["axmol.compiled_with_profiler"] = Value(true); #else - _valueDict["axmol.compiled_with_profiler"] = Value(false); + _valueDict["axmol.compiled_with_profiler"] = Value(false); #endif #if AX_ENABLE_GL_STATE_CACHE == 0 @@ -83,7 +83,17 @@ bool Configuration::init() #if _AX_DEBUG _valueDict["axmol.build_type"] = Value("DEBUG"); #else - _valueDict["axmol.build_type"] = Value("RELEASE"); + _valueDict["axmol.build_type"] = Value("RELEASE"); +#endif + +#if defined(AX_SSE_INTRINSICS) +# if defined(__SSE4_1__) + _valueDict["axmol.simd"] = Value("SSE41"); +# else + _valueDict["axmol.simd"] = Value("SSE2"); +# endif +#elif defined(AX_NEON_INTRINSICS) + _valueDict["axmol.simd"] = Value("NEON"); #endif return true; diff --git a/core/base/Console.cpp b/core/base/Console.cpp index b5fc8ef2d0c2..a9569d8697dd 100644 --- a/core/base/Console.cpp +++ b/core/base/Console.cpp @@ -398,7 +398,7 @@ bool Console::listenOnTCP(int port) if (sock.pserve(ep) != 0) { int ec = xxsocket::get_last_errno(); - AXLOGW("Console: open server failed, ec:{}", ec); + AXLOGW("Console: open server failed, ec:{}, {}", ec, xxsocket::strerror(ec)); return false; } diff --git a/core/math/Mat4.cpp b/core/math/Mat4.cpp index d08383871037..6ab50adc4ec4 100644 --- a/core/math/Mat4.cpp +++ b/core/math/Mat4.cpp @@ -17,7 +17,7 @@ Original file from GamePlay3D: http://gameplay3d.org - This file was modified to fit the cocos2d-x project + This file was modified to fit the axmol project */ #include "math/Mat4.h" @@ -459,11 +459,7 @@ void Mat4::add(float scalar) void Mat4::add(float scalar, Mat4* dst) { GP_ASSERT(dst); -#ifdef AX_USE_SSE - MathUtil::addMatrix(col, scalar, dst->col); -#else MathUtil::addMatrix(m, scalar, dst->m); -#endif } void Mat4::add(const Mat4& mat) @@ -474,11 +470,7 @@ void Mat4::add(const Mat4& mat) void Mat4::add(const Mat4& m1, const Mat4& m2, Mat4* dst) { GP_ASSERT(dst); -#ifdef AX_USE_SSE - MathUtil::addMatrix(m1.col, m2.col, dst->col); -#else MathUtil::addMatrix(m1.m, m2.m, dst->m); -#endif } bool Mat4::decompose(Vec3* scale, Quaternion* rotation, Vec3* translation) const @@ -751,11 +743,7 @@ void Mat4::multiply(float scalar, Mat4* dst) const void Mat4::multiply(const Mat4& m, float scalar, Mat4* dst) { GP_ASSERT(dst); -#ifdef AX_USE_SSE - MathUtil::multiplyMatrix(m.col, scalar, dst->col); -#else MathUtil::multiplyMatrix(m.m, scalar, dst->m); -#endif } void Mat4::multiply(const Mat4& mat) @@ -766,20 +754,12 @@ void Mat4::multiply(const Mat4& mat) void Mat4::multiply(const Mat4& m1, const Mat4& m2, Mat4* dst) { GP_ASSERT(dst); -#ifdef AX_USE_SSE - MathUtil::multiplyMatrix(m1.col, m2.col, dst->col); -#else MathUtil::multiplyMatrix(m1.m, m2.m, dst->m); -#endif } void Mat4::negate() { -#ifdef AX_USE_SSE - MathUtil::negateMatrix(col, col); -#else MathUtil::negateMatrix(m, m); -#endif } Mat4 Mat4::getNegated() const @@ -945,11 +925,7 @@ void Mat4::subtract(const Mat4& mat) void Mat4::subtract(const Mat4& m1, const Mat4& m2, Mat4* dst) { GP_ASSERT(dst); -#ifdef AX_USE_SSE - MathUtil::subtractMatrix(m1.col, m2.col, dst->col); -#else MathUtil::subtractMatrix(m1.m, m2.m, dst->m); -#endif } void Mat4::transformVector(Vec3* vector) const @@ -967,7 +943,7 @@ void Mat4::transformVector(float x, float y, float z, float w, Vec3* dst) const { GP_ASSERT(dst); - MathUtil::transformVec4(m, x, y, z, w, (float*)dst); + MathUtil::transformVec4(m, x, y, z, w, reinterpret_cast(dst)); } void Mat4::transformVector(Vec4* vector) const @@ -979,14 +955,7 @@ void Mat4::transformVector(Vec4* vector) const void Mat4::transformVector(const Vec4& vector, Vec4* dst) const { GP_ASSERT(dst); -#ifdef AX_USE_SSE - alignas(16) Vec4 inVal{vector}; - alignas(16) Vec4 outVal; - MathUtil::transformVec4(col, reinterpret_cast(inVal), reinterpret_cast<__m128&>(outVal)); - *dst = outVal; -#else - MathUtil::transformVec4(m, (const float*)&vector, (float*)dst); -#endif + MathUtil::transformVec4(m, reinterpret_cast(&vector), reinterpret_cast(dst)); } void Mat4::translate(float x, float y, float z) @@ -1013,11 +982,7 @@ void Mat4::translate(const Vec3& t, Mat4* dst) const void Mat4::transpose() { -#ifdef AX_USE_SSE - MathUtil::transposeMatrix(col, col); -#else MathUtil::transposeMatrix(m, m); -#endif } Mat4 Mat4::getTransposed() const diff --git a/core/math/Mat4.h b/core/math/Mat4.h index e6cd4b6f3757..613648901699 100644 --- a/core/math/Mat4.h +++ b/core/math/Mat4.h @@ -18,7 +18,7 @@ Original file from GamePlay3D: http://gameplay3d.org - This file was modified to fit the cocos2d-x project + This file was modified to fit the axmol project */ #ifndef MATH_MAT4_H @@ -29,10 +29,6 @@ #include "math/Vec3.h" #include "math/Vec4.h" -#ifdef AX_USE_SSE -# include -#endif - /** * @addtogroup base * @{ @@ -73,7 +69,7 @@ NS_AX_MATH_BEGIN * * @see Transform */ -#ifdef AX_USE_SSE +#if defined(AX_SSE_INTRINSICS) || defined(AX_NEON_INTRINSICS) class AX_DLL alignas(16) Mat4 #else class AX_DLL Mat4 @@ -95,10 +91,10 @@ class AX_DLL Mat4 /** * Stores the columns of this 4x4 matrix. * */ -#ifdef AX_USE_SSE +#if defined(AX_SSE_INTRINSICS) || defined(AX_NEON_INTRINSICS) union { - __m128 col[4]; + _xm128_t col[4]; float m[16]; }; #else diff --git a/core/math/Mat4.inl b/core/math/Mat4.inl index 69dac19e5fc8..1babde69f1f5 100644 --- a/core/math/Mat4.inl +++ b/core/math/Mat4.inl @@ -1,5 +1,6 @@ /** Copyright 2013 BlackBerry Inc. + Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md). Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +16,7 @@ Original file from GamePlay3D: http://gameplay3d.org - This file was modified to fit the cocos2d-x project + This file was modified to fit the axmol project */ #include "math/Mat4.h" diff --git a/core/math/MathBase.h b/core/math/MathBase.h index 16d0a62e75ea..72d1e581fbbd 100644 --- a/core/math/MathBase.h +++ b/core/math/MathBase.h @@ -1,5 +1,6 @@ /**************************************************************************** Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd. + Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md). https://axmol.dev/ @@ -22,46 +23,47 @@ THE SOFTWARE. ****************************************************************************/ -#ifndef __CCMATHBASE_H__ -#define __CCMATHBASE_H__ +#ifndef __AXMATHBASE_H__ +#define __AXMATHBASE_H__ #include #include #include "platform/PlatformMacros.h" + /** * @addtogroup base * @{ */ /**Util macro for conversion from degrees to radians.*/ -#define MATH_DEG_TO_RAD(x) ((x)*0.0174532925f) +#define MATH_DEG_TO_RAD(x) ((x) * 0.0174532925f) /**Util macro for conversion from radians to degrees.*/ -#define MATH_RAD_TO_DEG(x) ((x)*57.29577951f) +#define MATH_RAD_TO_DEG(x) ((x) * 57.29577951f) /** @{ Util macro for const float such as epsilon, small float and float precision tolerance. */ #define MATH_FLOAT_SMALL 1.0e-37f -#define MATH_TOLERANCE 2e-37f -#define MATH_PIOVER2 1.57079632679489661923f -#define MATH_EPSILON 0.000001f +#define MATH_TOLERANCE 2e-37f +#define MATH_PIOVER2 1.57079632679489661923f +#define MATH_EPSILON 0.000001f /**@}*/ -//#define MATH_PIOVER4 0.785398163397448309616f -//#define MATH_PIX2 6.28318530717958647693f -//#define MATH_E 2.71828182845904523536f -//#define MATH_LOG10E 0.4342944819032518f -//#define MATH_LOG2E 1.442695040888963387f -//#define MATH_PI 3.14159265358979323846f -//#define MATH_RANDOM_MINUS1_1() ((2.0f*((float)rand()/RAND_MAX))-1.0f) // Returns a random float between -1 -// and 1. #define MATH_RANDOM_0_1() ((float)rand()/RAND_MAX) // Returns a random float -// between 0 and 1. #define MATH_CLAMP(x, lo, hi) ((x < lo) ? lo : ((x > hi) ? hi : x)) #ifndef M_1_PI #define -// M_1_PI 0.31830988618379067154 +// #define MATH_PIOVER4 0.785398163397448309616f +// #define MATH_PIX2 6.28318530717958647693f +// #define MATH_E 2.71828182845904523536f +// #define MATH_LOG10E 0.4342944819032518f +// #define MATH_LOG2E 1.442695040888963387f +// #define MATH_PI 3.14159265358979323846f +// #define MATH_RANDOM_MINUS1_1() ((2.0f*((float)rand()/RAND_MAX))-1.0f) // Returns a random float between -1 +// and 1. #define MATH_RANDOM_0_1() ((float)rand()/RAND_MAX) // Returns a random float +// between 0 and 1. #define MATH_CLAMP(x, lo, hi) ((x < lo) ? lo : ((x > hi) ? hi : x)) #ifndef M_1_PI #define +// M_1_PI 0.31830988618379067154 #ifdef __cplusplus # define NS_AX_MATH_BEGIN \ - namespace ax \ + namespace ax \ { -# define NS_AX_MATH_END } +# define NS_AX_MATH_END } # define USING_NS_AX_MATH using namespace ax #else # define NS_AX_MATH_BEGIN diff --git a/core/math/MathUtil.cpp b/core/math/MathUtil.cpp index 9fb49000b3b2..805c8dfb331b 100644 --- a/core/math/MathUtil.cpp +++ b/core/math/MathUtil.cpp @@ -17,7 +17,7 @@ limitations under the License. Original file from GamePlay3D: http://gameplay3d.org -This file was modified to fit the cocos2d-x project +This file was modified to fit the axmol project */ #include "math/MathUtil.h" @@ -28,50 +28,10 @@ This file was modified to fit the cocos2d-x project # include #endif -//#define USE_NEON32 : neon 32 code will be used -//#define USE_NEON64 : neon 64 code will be used -//#define INCLUDE_NEON32 : neon 32 code included -//#define INCLUDE_NEON64 : neon 64 code included -//#define USE_SSE : SSE code used -//#define INCLUDE_SSE : SSE code included - -#if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS) -# if defined(__arm64__) -# define USE_NEON64 1 -# define INCLUDE_NEON64 1 -# elif defined(__ARM_NEON__) -# define USE_NEON32 1 -# define INCLUDE_NEON32 1 -# endif -#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX) -# if defined(__arm64__) || defined(__aarch64__) -# define USE_NEON64 1 -# define INCLUDE_NEON64 1 -# endif -#elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID) -# if defined(__arm64__) || defined(__aarch64__) -# define USE_NEON64 1 -# define INCLUDE_NEON64 1 -# elif defined(__ARM_NEON__) -# define INCLUDE_NEON32 1 -# endif -#endif - -#if defined(AX_USE_SSE) -# define USE_SSE 1 -# define INCLUDE_SSE 1 -#endif - -#ifdef INCLUDE_NEON32 -# include "math/MathUtilNeon.inl" -#endif - -#ifdef INCLUDE_NEON64 -# include "math/MathUtilNeon64.inl" -#endif - -#ifdef INCLUDE_SSE +#if defined(AX_SSE_INTRINSICS) # include "math/MathUtilSSE.inl" +#elif defined(AX_NEON_INTRINSICS) +# include "math/MathUtilNeon.inl" #endif #include "math/MathUtil.inl" @@ -106,9 +66,8 @@ float MathUtil::lerp(float from, float to, float alpha) bool MathUtil::isNeon32Enabled() { -#ifdef USE_NEON32 - return true; -#elif (defined(INCLUDE_NEON32) && (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)) +#if defined(AX_NEON_INTRINSICS) && !AX_64BITS +# if AX_NEON_INTRINSICS == 1 && AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID class AnrdoidNeonChecker { public: @@ -127,15 +86,9 @@ bool MathUtil::isNeon32Enabled() }; static AnrdoidNeonChecker checker; return checker.isNeonEnabled(); -#else - return false; -#endif -} - -bool MathUtil::isNeon64Enabled() -{ -#ifdef USE_NEON64 +# else return true; +# endif #else return false; #endif @@ -143,15 +96,17 @@ bool MathUtil::isNeon64Enabled() void MathUtil::addMatrix(const float* m, float scalar, float* dst) { -#ifdef USE_NEON32 - MathUtilNeon::addMatrix(m, scalar, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::addMatrix(m, scalar, dst); -#elif defined(INCLUDE_NEON32) +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::addMatrix(reinterpret_cast(m), scalar, reinterpret_cast<_xm128_t*>(dst)); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 + MathUtilNeon::addMatrix(reinterpret_cast(m), scalar, reinterpret_cast<_xm128_t*>(dst)); +# else if (isNeon32Enabled()) - MathUtilNeon::addMatrix(m, scalar, dst); + MathUtilNeon::addMatrix(reinterpret_cast(m), scalar, reinterpret_cast<_xm128_t*>(dst)); else MathUtilC::addMatrix(m, scalar, dst); +# endif #else MathUtilC::addMatrix(m, scalar, dst); #endif @@ -159,15 +114,20 @@ void MathUtil::addMatrix(const float* m, float scalar, float* dst) void MathUtil::addMatrix(const float* m1, const float* m2, float* dst) { -#ifdef USE_NEON32 - MathUtilNeon::addMatrix(m1, m2, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::addMatrix(m1, m2, dst); -#elif defined(INCLUDE_NEON32) +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::addMatrix(reinterpret_cast(m1), reinterpret_cast(m2), + reinterpret_cast<_xm128_t*>(dst)); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 + MathUtilNeon::addMatrix(reinterpret_cast(m1), reinterpret_cast(m2), + reinterpret_cast<_xm128_t*>(dst)); +# else if (isNeon32Enabled()) - MathUtilNeon::addMatrix(m1, m2, dst); + MathUtilNeon::addMatrix(reinterpret_cast(m1), reinterpret_cast(m2), + reinterpret_cast<_xm128_t*>(dst)); else MathUtilC::addMatrix(m1, m2, dst); +# endif #else MathUtilC::addMatrix(m1, m2, dst); #endif @@ -175,15 +135,20 @@ void MathUtil::addMatrix(const float* m1, const float* m2, float* dst) void MathUtil::subtractMatrix(const float* m1, const float* m2, float* dst) { -#ifdef USE_NEON32 - MathUtilNeon::subtractMatrix(m1, m2, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::subtractMatrix(m1, m2, dst); -#elif defined(INCLUDE_NEON32) +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::subtractMatrix(reinterpret_cast(m1), reinterpret_cast(m2), + reinterpret_cast<_xm128_t*>(dst)); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 + MathUtilNeon::subtractMatrix(reinterpret_cast(m1), reinterpret_cast(m2), + reinterpret_cast<_xm128_t*>(dst)); +# else if (isNeon32Enabled()) - MathUtilNeon::subtractMatrix(m1, m2, dst); + MathUtilNeon::subtractMatrix(reinterpret_cast(m1), reinterpret_cast(m2), + reinterpret_cast<_xm128_t*>(dst)); else MathUtilC::subtractMatrix(m1, m2, dst); +# endif #else MathUtilC::subtractMatrix(m1, m2, dst); #endif @@ -191,15 +156,17 @@ void MathUtil::subtractMatrix(const float* m1, const float* m2, float* dst) void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst) { -#ifdef USE_NEON32 - MathUtilNeon::multiplyMatrix(m, scalar, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::multiplyMatrix(m, scalar, dst); -#elif defined(INCLUDE_NEON32) +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::multiplyMatrix(reinterpret_cast(m), scalar, reinterpret_cast<_xm128_t*>(dst)); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 + MathUtilNeon::multiplyMatrix(reinterpret_cast(m), scalar, reinterpret_cast<_xm128_t*>(dst)); +# else if (isNeon32Enabled()) - MathUtilNeon::multiplyMatrix(m, scalar, dst); + MathUtilNeon::multiplyMatrix(reinterpret_cast(m), scalar, reinterpret_cast<_xm128_t*>(dst)); else MathUtilC::multiplyMatrix(m, scalar, dst); +# endif #else MathUtilC::multiplyMatrix(m, scalar, dst); #endif @@ -207,15 +174,20 @@ void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst) void MathUtil::multiplyMatrix(const float* m1, const float* m2, float* dst) { -#ifdef USE_NEON32 - MathUtilNeon::multiplyMatrix(m1, m2, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::multiplyMatrix(m1, m2, dst); -#elif defined(INCLUDE_NEON32) +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::multiplyMatrix(reinterpret_cast(m1), reinterpret_cast(m2), + reinterpret_cast<_xm128_t*>(dst)); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 + MathUtilNeon::multiplyMatrix(reinterpret_cast(m1), reinterpret_cast(m2), + reinterpret_cast<_xm128_t*>(dst)); +# else if (isNeon32Enabled()) - MathUtilNeon::multiplyMatrix(m1, m2, dst); + MathUtilNeon::multiplyMatrix(reinterpret_cast(m1), reinterpret_cast(m2), + reinterpret_cast<_xm128_t*>(dst)); else MathUtilC::multiplyMatrix(m1, m2, dst); +# endif #else MathUtilC::multiplyMatrix(m1, m2, dst); #endif @@ -223,15 +195,17 @@ void MathUtil::multiplyMatrix(const float* m1, const float* m2, float* dst) void MathUtil::negateMatrix(const float* m, float* dst) { -#ifdef USE_NEON32 - MathUtilNeon::negateMatrix(m, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::negateMatrix(m, dst); -#elif defined(INCLUDE_NEON32) +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::negateMatrix(reinterpret_cast(m), reinterpret_cast<_xm128_t*>(dst)); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 + MathUtilNeon::negateMatrix(reinterpret_cast(m), reinterpret_cast<_xm128_t*>(dst)); +# else if (isNeon32Enabled()) - MathUtilNeon::negateMatrix(m, dst); + MathUtilNeon::negateMatrix(reinterpret_cast(m), reinterpret_cast<_xm128_t*>(dst)); else MathUtilC::negateMatrix(m, dst); +# endif #else MathUtilC::negateMatrix(m, dst); #endif @@ -239,47 +213,53 @@ void MathUtil::negateMatrix(const float* m, float* dst) void MathUtil::transposeMatrix(const float* m, float* dst) { -#ifdef USE_NEON32 - MathUtilNeon::transposeMatrix(m, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::transposeMatrix(m, dst); -#elif defined(INCLUDE_NEON32) +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::transposeMatrix(reinterpret_cast(m), reinterpret_cast<_xm128_t*>(dst)); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 + MathUtilNeon::transposeMatrix(reinterpret_cast(m), reinterpret_cast<_xm128_t*>(dst)); +# else if (isNeon32Enabled()) - MathUtilNeon::transposeMatrix(m, dst); + MathUtilNeon::transposeMatrix(reinterpret_cast(m), reinterpret_cast<_xm128_t*>(dst)); else MathUtilC::transposeMatrix(m, dst); +# endif #else MathUtilC::transposeMatrix(m, dst); #endif } -void MathUtil::transformVec4(const float* m, float x, float y, float z, float w, float* dst) +void MathUtil::transformVec4(const float* m, float x, float y, float z, float w, float* dst /*vec3*/) { -#ifdef USE_NEON32 - MathUtilNeon::transformVec4(m, x, y, z, w, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::transformVec4(m, x, y, z, w, dst); -#elif defined(INCLUDE_NEON32) +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::transformVec4(reinterpret_cast(m), x, y, z, w, dst); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 + MathUtilNeon::transformVec4(reinterpret_cast(m), x, y, z, w, dst); +# else if (isNeon32Enabled()) - MathUtilNeon::transformVec4(m, x, y, z, w, dst); + MathUtilNeon::transformVec4(reinterpret_cast(m), x, y, z, w, dst); else MathUtilC::transformVec4(m, x, y, z, w, dst); +# endif #else MathUtilC::transformVec4(m, x, y, z, w, dst); #endif } -void MathUtil::transformVec4(const float* m, const float* v, float* dst) +void MathUtil::transformVec4(const float* m, const float* v, float* dst /*vec4*/) { -#ifdef USE_NEON32 - MathUtilNeon::transformVec4(m, v, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::transformVec4(m, v, dst); -#elif defined(INCLUDE_NEON32) +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::transformVec4(reinterpret_cast(m), v, dst); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 + MathUtilNeon::transformVec4(reinterpret_cast(m), v, dst); +# else if (isNeon32Enabled()) - MathUtilNeon::transformVec4(m, v, dst); + MathUtilNeon::transformVec4(reinterpret_cast(m), v, dst); else MathUtilC::transformVec4(m, v, dst); +# endif #else MathUtilC::transformVec4(m, v, dst); #endif @@ -287,15 +267,17 @@ void MathUtil::transformVec4(const float* m, const float* v, float* dst) void MathUtil::crossVec3(const float* v1, const float* v2, float* dst) { -#ifdef USE_NEON32 +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::crossVec3(v1, v2, dst); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 MathUtilNeon::crossVec3(v1, v2, dst); -#elif defined(USE_NEON64) - MathUtilNeon64::crossVec3(v1, v2, dst); -#elif defined(INCLUDE_NEON32) +# else if (isNeon32Enabled()) MathUtilNeon::crossVec3(v1, v2, dst); else MathUtilC::crossVec3(v1, v2, dst); +# endif #else MathUtilC::crossVec3(v1, v2, dst); #endif @@ -308,24 +290,28 @@ void MathUtil::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_ static_assert(offsetof(V3F_C4B_T2F, vertices) == 0); static_assert(offsetof(V3F_C4B_T2F, colors) == 12); static_assert(offsetof(V3F_C4B_T2F, texCoords) == 16); - -#ifdef USE_NEON32 +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::transformVertices(dst, src, count, transform); +#elif defined(AX_NEON_INTRINSICS) +# if AX_64BITS || AX_NEON_INTRINSICS > 1 MathUtilNeon::transformVertices(dst, src, count, transform); -#elif defined(USE_NEON64) - MathUtilNeon64::transformVertices(dst, src, count, transform); -#elif defined(INCLUDE_NEON32) +# else if (isNeon32Enabled()) MathUtilNeon::transformVertices(dst, src, count, transform); else MathUtilC::transformVertices(dst, src, count, transform); +# endif #else MathUtilC::transformVertices(dst, src, count, transform); #endif } -void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) { -#if defined(USE_NEON64) - MathUtilNeon64::transformIndices(dst, src, count, offset); +void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) +{ +#if defined(AX_SSE_INTRINSICS) + MathUtilSSE::transformIndices(dst, src, count, offset); +#elif defined(AX_NEON_INTRINSICS) && AX_64BITS + MathUtilNeon::transformIndices(dst, src, count, offset); #else MathUtilC::transformIndices(dst, src, count, offset); #endif diff --git a/core/math/MathUtil.h b/core/math/MathUtil.h index 7cb78b7845f0..b7057fe8c7d3 100644 --- a/core/math/MathUtil.h +++ b/core/math/MathUtil.h @@ -18,16 +18,12 @@ Original file from GamePlay3D: http://gameplay3d.org - This file was modified to fit the cocos2d-x project + This file was modified to fit the axmol project */ #ifndef MATHUTIL_H_ #define MATHUTIL_H_ -#ifdef AX_USE_SSE -# include -#endif - #include "math/MathBase.h" @@ -42,7 +38,7 @@ NS_AX_END NS_AX_MATH_BEGIN -class Mat4; +class Vec4; /** * Defines a math utility class. @@ -100,26 +96,8 @@ class AX_DLL MathUtil private: // Indicates that if neon is enabled static bool isNeon32Enabled(); - static bool isNeon64Enabled(); private: -#ifdef AX_USE_SSE - static void addMatrix(const __m128 m[4], float scalar, __m128 dst[4]); - - static void addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]); - - static void subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]); - - static void multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4]); - - static void multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]); - - static void negateMatrix(const __m128 m[4], __m128 dst[4]); - - static void transposeMatrix(const __m128 m[4], __m128 dst[4]); - - static void transformVec4(const __m128 m[4], const __m128& v, __m128& dst); -#endif static void addMatrix(const float* m, float scalar, float* dst); static void addMatrix(const float* m1, const float* m2, float* dst); @@ -134,9 +112,9 @@ class AX_DLL MathUtil static void transposeMatrix(const float* m, float* dst); - static void transformVec4(const float* m, float x, float y, float z, float w, float* dst); + static void transformVec4(const float* m, float x, float y, float z, float w, float* dst/*vec3*/); - static void transformVec4(const float* m, const float* v, float* dst); + static void transformVec4(const float* m, const float* v, float* dst/*vec4*/); static void crossVec3(const float* v1, const float* v2, float* dst); diff --git a/core/math/MathUtil.inl b/core/math/MathUtil.inl index 4d7028bdbd59..a2da119df439 100644 --- a/core/math/MathUtil.inl +++ b/core/math/MathUtil.inl @@ -16,7 +16,7 @@ Original file from GamePlay3D: http://gameplay3d.org - This file was modified to fit the cocos2d-x project + This file was modified to fit the axmol project */ NS_AX_MATH_BEGIN @@ -24,221 +24,201 @@ NS_AX_MATH_BEGIN class MathUtilC { public: - inline static void addMatrix(const float* m, float scalar, float* dst); - inline static void addMatrix(const float* m1, const float* m2, float* dst); - inline static void subtractMatrix(const float* m1, const float* m2, float* dst); - inline static void multiplyMatrix(const float* m, float scalar, float* dst); - inline static void multiplyMatrix(const float* m1, const float* m2, float* dst); - - inline static void negateMatrix(const float* m, float* dst); - inline static void transposeMatrix(const float* m, float* dst); - - inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst); - inline static void transformVec4(const float* m, const float* v, float* dst); - inline static void crossVec3(const float* v1, const float* v2, float* dst); - - inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform); - inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset); -}; + inline static void addMatrix(const float* m, float scalar, float* dst) + { + dst[0] = m[0] + scalar; + dst[1] = m[1] + scalar; + dst[2] = m[2] + scalar; + dst[3] = m[3] + scalar; + dst[4] = m[4] + scalar; + dst[5] = m[5] + scalar; + dst[6] = m[6] + scalar; + dst[7] = m[7] + scalar; + dst[8] = m[8] + scalar; + dst[9] = m[9] + scalar; + dst[10] = m[10] + scalar; + dst[11] = m[11] + scalar; + dst[12] = m[12] + scalar; + dst[13] = m[13] + scalar; + dst[14] = m[14] + scalar; + dst[15] = m[15] + scalar; + } -inline void MathUtilC::addMatrix(const float* m, float scalar, float* dst) -{ - dst[0] = m[0] + scalar; - dst[1] = m[1] + scalar; - dst[2] = m[2] + scalar; - dst[3] = m[3] + scalar; - dst[4] = m[4] + scalar; - dst[5] = m[5] + scalar; - dst[6] = m[6] + scalar; - dst[7] = m[7] + scalar; - dst[8] = m[8] + scalar; - dst[9] = m[9] + scalar; - dst[10] = m[10] + scalar; - dst[11] = m[11] + scalar; - dst[12] = m[12] + scalar; - dst[13] = m[13] + scalar; - dst[14] = m[14] + scalar; - dst[15] = m[15] + scalar; -} - -inline void MathUtilC::addMatrix(const float* m1, const float* m2, float* dst) -{ - dst[0] = m1[0] + m2[0]; - dst[1] = m1[1] + m2[1]; - dst[2] = m1[2] + m2[2]; - dst[3] = m1[3] + m2[3]; - dst[4] = m1[4] + m2[4]; - dst[5] = m1[5] + m2[5]; - dst[6] = m1[6] + m2[6]; - dst[7] = m1[7] + m2[7]; - dst[8] = m1[8] + m2[8]; - dst[9] = m1[9] + m2[9]; - dst[10] = m1[10] + m2[10]; - dst[11] = m1[11] + m2[11]; - dst[12] = m1[12] + m2[12]; - dst[13] = m1[13] + m2[13]; - dst[14] = m1[14] + m2[14]; - dst[15] = m1[15] + m2[15]; -} - -inline void MathUtilC::subtractMatrix(const float* m1, const float* m2, float* dst) -{ - dst[0] = m1[0] - m2[0]; - dst[1] = m1[1] - m2[1]; - dst[2] = m1[2] - m2[2]; - dst[3] = m1[3] - m2[3]; - dst[4] = m1[4] - m2[4]; - dst[5] = m1[5] - m2[5]; - dst[6] = m1[6] - m2[6]; - dst[7] = m1[7] - m2[7]; - dst[8] = m1[8] - m2[8]; - dst[9] = m1[9] - m2[9]; - dst[10] = m1[10] - m2[10]; - dst[11] = m1[11] - m2[11]; - dst[12] = m1[12] - m2[12]; - dst[13] = m1[13] - m2[13]; - dst[14] = m1[14] - m2[14]; - dst[15] = m1[15] - m2[15]; -} - -inline void MathUtilC::multiplyMatrix(const float* m, float scalar, float* dst) -{ - dst[0] = m[0] * scalar; - dst[1] = m[1] * scalar; - dst[2] = m[2] * scalar; - dst[3] = m[3] * scalar; - dst[4] = m[4] * scalar; - dst[5] = m[5] * scalar; - dst[6] = m[6] * scalar; - dst[7] = m[7] * scalar; - dst[8] = m[8] * scalar; - dst[9] = m[9] * scalar; - dst[10] = m[10] * scalar; - dst[11] = m[11] * scalar; - dst[12] = m[12] * scalar; - dst[13] = m[13] * scalar; - dst[14] = m[14] * scalar; - dst[15] = m[15] * scalar; -} - -inline void MathUtilC::multiplyMatrix(const float* m1, const float* m2, float* dst) -{ - // Support the case where m1 or m2 is the same array as dst. - float product[16]; + inline static void addMatrix(const float* m1, const float* m2, float* dst) + { + dst[0] = m1[0] + m2[0]; + dst[1] = m1[1] + m2[1]; + dst[2] = m1[2] + m2[2]; + dst[3] = m1[3] + m2[3]; + dst[4] = m1[4] + m2[4]; + dst[5] = m1[5] + m2[5]; + dst[6] = m1[6] + m2[6]; + dst[7] = m1[7] + m2[7]; + dst[8] = m1[8] + m2[8]; + dst[9] = m1[9] + m2[9]; + dst[10] = m1[10] + m2[10]; + dst[11] = m1[11] + m2[11]; + dst[12] = m1[12] + m2[12]; + dst[13] = m1[13] + m2[13]; + dst[14] = m1[14] + m2[14]; + dst[15] = m1[15] + m2[15]; + } - product[0] = m1[0] * m2[0] + m1[4] * m2[1] + m1[8] * m2[2] + m1[12] * m2[3]; - product[1] = m1[1] * m2[0] + m1[5] * m2[1] + m1[9] * m2[2] + m1[13] * m2[3]; - product[2] = m1[2] * m2[0] + m1[6] * m2[1] + m1[10] * m2[2] + m1[14] * m2[3]; - product[3] = m1[3] * m2[0] + m1[7] * m2[1] + m1[11] * m2[2] + m1[15] * m2[3]; + inline static void subtractMatrix(const float* m1, const float* m2, float* dst) + { + dst[0] = m1[0] - m2[0]; + dst[1] = m1[1] - m2[1]; + dst[2] = m1[2] - m2[2]; + dst[3] = m1[3] - m2[3]; + dst[4] = m1[4] - m2[4]; + dst[5] = m1[5] - m2[5]; + dst[6] = m1[6] - m2[6]; + dst[7] = m1[7] - m2[7]; + dst[8] = m1[8] - m2[8]; + dst[9] = m1[9] - m2[9]; + dst[10] = m1[10] - m2[10]; + dst[11] = m1[11] - m2[11]; + dst[12] = m1[12] - m2[12]; + dst[13] = m1[13] - m2[13]; + dst[14] = m1[14] - m2[14]; + dst[15] = m1[15] - m2[15]; + } - product[4] = m1[0] * m2[4] + m1[4] * m2[5] + m1[8] * m2[6] + m1[12] * m2[7]; - product[5] = m1[1] * m2[4] + m1[5] * m2[5] + m1[9] * m2[6] + m1[13] * m2[7]; - product[6] = m1[2] * m2[4] + m1[6] * m2[5] + m1[10] * m2[6] + m1[14] * m2[7]; - product[7] = m1[3] * m2[4] + m1[7] * m2[5] + m1[11] * m2[6] + m1[15] * m2[7]; + inline static void multiplyMatrix(const float* m, float scalar, float* dst) + { + dst[0] = m[0] * scalar; + dst[1] = m[1] * scalar; + dst[2] = m[2] * scalar; + dst[3] = m[3] * scalar; + dst[4] = m[4] * scalar; + dst[5] = m[5] * scalar; + dst[6] = m[6] * scalar; + dst[7] = m[7] * scalar; + dst[8] = m[8] * scalar; + dst[9] = m[9] * scalar; + dst[10] = m[10] * scalar; + dst[11] = m[11] * scalar; + dst[12] = m[12] * scalar; + dst[13] = m[13] * scalar; + dst[14] = m[14] * scalar; + dst[15] = m[15] * scalar; + } - product[8] = m1[0] * m2[8] + m1[4] * m2[9] + m1[8] * m2[10] + m1[12] * m2[11]; - product[9] = m1[1] * m2[8] + m1[5] * m2[9] + m1[9] * m2[10] + m1[13] * m2[11]; - product[10] = m1[2] * m2[8] + m1[6] * m2[9] + m1[10] * m2[10] + m1[14] * m2[11]; - product[11] = m1[3] * m2[8] + m1[7] * m2[9] + m1[11] * m2[10] + m1[15] * m2[11]; + inline static void multiplyMatrix(const float* m1, const float* m2, float* dst) + { + // Support the case where m1 or m2 is the same array as dst. + float product[16]; + + product[0] = m1[0] * m2[0] + m1[4] * m2[1] + m1[8] * m2[2] + m1[12] * m2[3]; + product[1] = m1[1] * m2[0] + m1[5] * m2[1] + m1[9] * m2[2] + m1[13] * m2[3]; + product[2] = m1[2] * m2[0] + m1[6] * m2[1] + m1[10] * m2[2] + m1[14] * m2[3]; + product[3] = m1[3] * m2[0] + m1[7] * m2[1] + m1[11] * m2[2] + m1[15] * m2[3]; + + product[4] = m1[0] * m2[4] + m1[4] * m2[5] + m1[8] * m2[6] + m1[12] * m2[7]; + product[5] = m1[1] * m2[4] + m1[5] * m2[5] + m1[9] * m2[6] + m1[13] * m2[7]; + product[6] = m1[2] * m2[4] + m1[6] * m2[5] + m1[10] * m2[6] + m1[14] * m2[7]; + product[7] = m1[3] * m2[4] + m1[7] * m2[5] + m1[11] * m2[6] + m1[15] * m2[7]; + + product[8] = m1[0] * m2[8] + m1[4] * m2[9] + m1[8] * m2[10] + m1[12] * m2[11]; + product[9] = m1[1] * m2[8] + m1[5] * m2[9] + m1[9] * m2[10] + m1[13] * m2[11]; + product[10] = m1[2] * m2[8] + m1[6] * m2[9] + m1[10] * m2[10] + m1[14] * m2[11]; + product[11] = m1[3] * m2[8] + m1[7] * m2[9] + m1[11] * m2[10] + m1[15] * m2[11]; + + product[12] = m1[0] * m2[12] + m1[4] * m2[13] + m1[8] * m2[14] + m1[12] * m2[15]; + product[13] = m1[1] * m2[12] + m1[5] * m2[13] + m1[9] * m2[14] + m1[13] * m2[15]; + product[14] = m1[2] * m2[12] + m1[6] * m2[13] + m1[10] * m2[14] + m1[14] * m2[15]; + product[15] = m1[3] * m2[12] + m1[7] * m2[13] + m1[11] * m2[14] + m1[15] * m2[15]; + + memcpy(dst, product, MATRIX_SIZE); + } - product[12] = m1[0] * m2[12] + m1[4] * m2[13] + m1[8] * m2[14] + m1[12] * m2[15]; - product[13] = m1[1] * m2[12] + m1[5] * m2[13] + m1[9] * m2[14] + m1[13] * m2[15]; - product[14] = m1[2] * m2[12] + m1[6] * m2[13] + m1[10] * m2[14] + m1[14] * m2[15]; - product[15] = m1[3] * m2[12] + m1[7] * m2[13] + m1[11] * m2[14] + m1[15] * m2[15]; + inline static void negateMatrix(const float* m, float* dst) + { + dst[0] = -m[0]; + dst[1] = -m[1]; + dst[2] = -m[2]; + dst[3] = -m[3]; + dst[4] = -m[4]; + dst[5] = -m[5]; + dst[6] = -m[6]; + dst[7] = -m[7]; + dst[8] = -m[8]; + dst[9] = -m[9]; + dst[10] = -m[10]; + dst[11] = -m[11]; + dst[12] = -m[12]; + dst[13] = -m[13]; + dst[14] = -m[14]; + dst[15] = -m[15]; + } - memcpy(dst, product, MATRIX_SIZE); -} + inline static void transposeMatrix(const float* m, float* dst) + { + float t[16] = {m[0], m[4], m[8], m[12], m[1], m[5], m[9], m[13], + m[2], m[6], m[10], m[14], m[3], m[7], m[11], m[15]}; + memcpy(dst, t, MATRIX_SIZE); + } -inline void MathUtilC::negateMatrix(const float* m, float* dst) -{ - dst[0] = -m[0]; - dst[1] = -m[1]; - dst[2] = -m[2]; - dst[3] = -m[3]; - dst[4] = -m[4]; - dst[5] = -m[5]; - dst[6] = -m[6]; - dst[7] = -m[7]; - dst[8] = -m[8]; - dst[9] = -m[9]; - dst[10] = -m[10]; - dst[11] = -m[11]; - dst[12] = -m[12]; - dst[13] = -m[13]; - dst[14] = -m[14]; - dst[15] = -m[15]; -} - -inline void MathUtilC::transposeMatrix(const float* m, float* dst) -{ - float t[16] = { - m[0], m[4], m[8], m[12], - m[1], m[5], m[9], m[13], - m[2], m[6], m[10], m[14], - m[3], m[7], m[11], m[15] - }; - memcpy(dst, t, MATRIX_SIZE); -} - -inline void MathUtilC::transformVec4(const float* m, float x, float y, float z, float w, float* dst) -{ - dst[0] = x * m[0] + y * m[4] + z * m[8] + w * m[12]; - dst[1] = x * m[1] + y * m[5] + z * m[9] + w * m[13]; - dst[2] = x * m[2] + y * m[6] + z * m[10] + w * m[14]; -} + inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst) + { + dst[0] = x * m[0] + y * m[4] + z * m[8] + w * m[12]; + dst[1] = x * m[1] + y * m[5] + z * m[9] + w * m[13]; + dst[2] = x * m[2] + y * m[6] + z * m[10] + w * m[14]; + } -inline void MathUtilC::transformVec4(const float* m, const float* v, float* dst) -{ - // Handle case where v == dst. - float x = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + v[3] * m[12]; - float y = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + v[3] * m[13]; - float z = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + v[3] * m[14]; - float w = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + v[3] * m[15]; - - dst[0] = x; - dst[1] = y; - dst[2] = z; - dst[3] = w; -} - -inline void MathUtilC::crossVec3(const float* v1, const float* v2, float* dst) -{ - float x = (v1[1] * v2[2]) - (v1[2] * v2[1]); - float y = (v1[2] * v2[0]) - (v1[0] * v2[2]); - float z = (v1[0] * v2[1]) - (v1[1] * v2[0]); + inline static void transformVec4(const float* m, const float* v, float* dst) + { + // Handle case where v == dst. + float x = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + v[3] * m[12]; + float y = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + v[3] * m[13]; + float z = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + v[3] * m[14]; + float w = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + v[3] * m[15]; + + dst[0] = x; + dst[1] = y; + dst[2] = z; + dst[3] = w; + } - dst[0] = x; - dst[1] = y; - dst[2] = z; -} + inline static void crossVec3(const float* v1, const float* v2, float* dst) + { + float x = (v1[1] * v2[2]) - (v1[2] * v2[1]); + float y = (v1[2] * v2[0]) - (v1[0] * v2[2]); + float z = (v1[0] * v2[1]) - (v1[1] * v2[0]); -inline void MathUtilC::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform) -{ - auto end = dst + count; - auto t = transform; // Make copy for better aliasing inference - auto m = t.m; + dst[0] = x; + dst[1] = y; + dst[2] = z; + } - while (dst < end) + inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform) { - auto pos = src->vertices; - dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12]; - dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13]; - dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14]; - memcpy(&dst->colors, &src->colors, sizeof(dst->colors) + sizeof(dst->texCoords)); - ++dst; - ++src; + auto end = dst + count; + auto& t = transform; // Make copy for better aliasing inference + auto m = t.m; + + while (dst < end) + { + auto pos = src->vertices; + dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12]; + dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13]; + dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14]; + memcpy(&dst->colors, &src->colors, sizeof(V3F_C4B_T2F::colors) + sizeof(V3F_C4B_T2F::texCoords)); + ++dst; + ++src; + } } -} -inline void MathUtilC::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) -{ - auto end = dst + count; - while (dst < end) + inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) { - *dst = *src + offset; - ++dst; - ++src; + auto end = dst + count; + while (dst < end) + { + *dst = *src + offset; + ++dst; + ++src; + } } -} +}; NS_AX_MATH_END diff --git a/core/math/MathUtilNeon.inl b/core/math/MathUtilNeon.inl index e80382490351..42773e51637c 100644 --- a/core/math/MathUtilNeon.inl +++ b/core/math/MathUtilNeon.inl @@ -16,356 +16,374 @@ Original file from GamePlay3D: http://gameplay3d.org - This file was modified to fit the cocos2d-x project + This file was modified to fit the axmol project */ #include NS_AX_MATH_BEGIN -class MathUtilNeon +struct MathUtilNeon { -public: - inline static void addMatrix(const float* m, float scalar, float* dst); - inline static void addMatrix(const float* m1, const float* m2, float* dst); - inline static void subtractMatrix(const float* m1, const float* m2, float* dst); - inline static void multiplyMatrix(const float* m, float scalar, float* dst); - inline static void multiplyMatrix(const float* m1, const float* m2, float* dst); +#if defined(__EMSCRIPTEN__) +# define vmlaq_lane_f32(a, b, c, lane) vaddq_f32(a, vmulq_lane_f32(b, c, lane)) +#endif - inline static void negateMatrix(const float* m, float* dst); - inline static void transposeMatrix(const float* m, float* dst); + inline static void addMatrix(const _xm128_t* m, float scalar, _xm128_t* dst) + { + float32x4_t s = vdupq_n_f32(scalar); + dst[0] = vaddq_f32(m[0], s); + dst[1] = vaddq_f32(m[1], s); + dst[2] = vaddq_f32(m[2], s); + dst[3] = vaddq_f32(m[3], s); + } - inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst); - inline static void transformVec4(const float* m, const float* v, float* dst); - inline static void crossVec3(const float* v1, const float* v2, float* dst); + inline static void addMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst) + { + dst[0] = vaddq_f32(m1[0], m2[0]); + dst[1] = vaddq_f32(m1[1], m2[1]); + dst[2] = vaddq_f32(m1[2], m2[2]); + dst[3] = vaddq_f32(m1[3], m2[3]); + } - inline static void transformVertices(ax::V3F_C4B_T2F* dst, const ax::V3F_C4B_T2F* src, size_t count, const ax::Mat4& transform); -}; + inline static void subtractMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst) + { + dst[0] = vsubq_f32(m1[0], m2[0]); + dst[1] = vsubq_f32(m1[1], m2[1]); + dst[2] = vsubq_f32(m1[2], m2[2]); + dst[3] = vsubq_f32(m1[3], m2[3]); + } -inline void MathUtilNeon::addMatrix(const float* m, float scalar, float* dst) -{ - asm volatile( - "vld1.32 {q0, q1}, [%1]! \n\t" // M[m0-m7] - "vld1.32 {q2, q3}, [%1] \n\t" // M[m8-m15] - "vld1.32 {d8[0]}, [%2] \n\t" // s - "vmov.f32 s17, s16 \n\t" // s - "vmov.f32 s18, s16 \n\t" // s - "vmov.f32 s19, s16 \n\t" // s - - "vadd.f32 q8, q0, q4 \n\t" // DST->M[m0-m3] = M[m0-m3] + s - "vadd.f32 q9, q1, q4 \n\t" // DST->M[m4-m7] = M[m4-m7] + s - "vadd.f32 q10, q2, q4 \n\t" // DST->M[m8-m11] = M[m8-m11] + s - "vadd.f32 q11, q3, q4 \n\t" // DST->M[m12-m15] = M[m12-m15] + s - - "vst1.32 {q8, q9}, [%0]! \n\t" // DST->M[m0-m7] - "vst1.32 {q10, q11}, [%0] \n\t" // DST->M[m8-m15] - : - : "r"(dst), "r"(m), "r"(&scalar) - : "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "memory" - ); -} - -inline void MathUtilNeon::addMatrix(const float* m1, const float* m2, float* dst) -{ - asm volatile( - "vld1.32 {q0, q1}, [%1]! \n\t" // M1[m0-m7] - "vld1.32 {q2, q3}, [%1] \n\t" // M1[m8-m15] - "vld1.32 {q8, q9}, [%2]! \n\t" // M2[m0-m7] - "vld1.32 {q10, q11}, [%2] \n\t" // M2[m8-m15] - - "vadd.f32 q12, q0, q8 \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3] - "vadd.f32 q13, q1, q9 \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7] - "vadd.f32 q14, q2, q10 \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11] - "vadd.f32 q15, q3, q11 \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15] - - "vst1.32 {q12, q13}, [%0]! \n\t" // DST->M[m0-m7] - "vst1.32 {q14, q15}, [%0] \n\t" // DST->M[m8-m15] - : - : "r"(dst), "r"(m1), "r"(m2) - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory" - ); -} - -inline void MathUtilNeon::subtractMatrix(const float* m1, const float* m2, float* dst) -{ - asm volatile( - "vld1.32 {q0, q1}, [%1]! \n\t" // M1[m0-m7] - "vld1.32 {q2, q3}, [%1] \n\t" // M1[m8-m15] - "vld1.32 {q8, q9}, [%2]! \n\t" // M2[m0-m7] - "vld1.32 {q10, q11}, [%2] \n\t" // M2[m8-m15] - - "vsub.f32 q12, q0, q8 \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3] - "vsub.f32 q13, q1, q9 \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7] - "vsub.f32 q14, q2, q10 \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11] - "vsub.f32 q15, q3, q11 \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15] - - "vst1.32 {q12, q13}, [%0]! \n\t" // DST->M[m0-m7] - "vst1.32 {q14, q15}, [%0] \n\t" // DST->M[m8-m15] - : - : "r"(dst), "r"(m1), "r"(m2) - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory" - ); -} - -inline void MathUtilNeon::multiplyMatrix(const float* m, float scalar, float* dst) -{ - asm volatile( - "vld1.32 {d0[0]}, [%2] \n\t" // M[m0-m7] - "vld1.32 {q4-q5}, [%1]! \n\t" // M[m8-m15] - "vld1.32 {q6-q7}, [%1] \n\t" // s - - "vmul.f32 q8, q4, d0[0] \n\t" // DST->M[m0-m3] = M[m0-m3] * s - "vmul.f32 q9, q5, d0[0] \n\t" // DST->M[m4-m7] = M[m4-m7] * s - "vmul.f32 q10, q6, d0[0] \n\t" // DST->M[m8-m11] = M[m8-m11] * s - "vmul.f32 q11, q7, d0[0] \n\t" // DST->M[m12-m15] = M[m12-m15] * s - - "vst1.32 {q8-q9}, [%0]! \n\t" // DST->M[m0-m7] - "vst1.32 {q10-q11}, [%0] \n\t" // DST->M[m8-m15] - : - : "r"(dst), "r"(m), "r"(&scalar) - : "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "memory" - ); -} - -inline void MathUtilNeon::multiplyMatrix(const float* m1, const float* m2, float* dst) -{ - asm volatile( - "vld1.32 {d16 - d19}, [%1]! \n\t" // M1[m0-m7] - "vld1.32 {d20 - d23}, [%1] \n\t" // M1[m8-m15] - "vld1.32 {d0 - d3}, [%2]! \n\t" // M2[m0-m7] - "vld1.32 {d4 - d7}, [%2] \n\t" // M2[m8-m15] - - "vmul.f32 q12, q8, d0[0] \n\t" // DST->M[m0-m3] = M1[m0-m3] * M2[m0] - "vmul.f32 q13, q8, d2[0] \n\t" // DST->M[m4-m7] = M1[m4-m7] * M2[m4] - "vmul.f32 q14, q8, d4[0] \n\t" // DST->M[m8-m11] = M1[m8-m11] * M2[m8] - "vmul.f32 q15, q8, d6[0] \n\t" // DST->M[m12-m15] = M1[m12-m15] * M2[m12] - - "vmla.f32 q12, q9, d0[1] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m1] - "vmla.f32 q13, q9, d2[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m5] - "vmla.f32 q14, q9, d4[1] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m9] - "vmla.f32 q15, q9, d6[1] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m13] - - "vmla.f32 q12, q10, d1[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m2] - "vmla.f32 q13, q10, d3[0] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m6] - "vmla.f32 q14, q10, d5[0] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m10] - "vmla.f32 q15, q10, d7[0] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m14] - - "vmla.f32 q12, q11, d1[1] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m3] - "vmla.f32 q13, q11, d3[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m7] - "vmla.f32 q14, q11, d5[1] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m11] - "vmla.f32 q15, q11, d7[1] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m15] - - "vst1.32 {d24 - d27}, [%0]! \n\t" // DST->M[m0-m7] - "vst1.32 {d28 - d31}, [%0] \n\t" // DST->M[m8-m15] - - : // output - : "r"(dst), "r"(m1), "r"(m2) // input - note *value* of pointer doesn't change. - : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -inline void MathUtilNeon::negateMatrix(const float* m, float* dst) -{ - asm volatile( - "vld1.32 {q0-q1}, [%1]! \n\t" // load m0-m7 - "vld1.32 {q2-q3}, [%1] \n\t" // load m8-m15 - - "vneg.f32 q4, q0 \n\t" // negate m0-m3 - "vneg.f32 q5, q1 \n\t" // negate m4-m7 - "vneg.f32 q6, q2 \n\t" // negate m8-m15 - "vneg.f32 q7, q3 \n\t" // negate m8-m15 - - "vst1.32 {q4-q5}, [%0]! \n\t" // store m0-m7 - "vst1.32 {q6-q7}, [%0] \n\t" // store m8-m15 - : - : "r"(dst), "r"(m) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory" - ); -} - -inline void MathUtilNeon::transposeMatrix(const float* m, float* dst) -{ - asm volatile( - "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%1]! \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3] - "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1]! \n\t" // DST->M[m1, m5, m9, m12] = M[m4-m7] - "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%1]! \n\t" // DST->M[m2, m6, m10, m12] = M[m8-m11] - "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%1] \n\t" // DST->M[m3, m7, m11, m12] = M[m12-m15] - - "vst1.32 {q0-q1}, [%0]! \n\t" // DST->M[m0-m7] - "vst1.32 {q2-q3}, [%0] \n\t" // DST->M[m8-m15] - : - : "r"(dst), "r"(m) - : "q0", "q1", "q2", "q3", "memory" - ); -} - -inline void MathUtilNeon::transformVec4(const float* m, float x, float y, float z, float w, float* dst) -{ - asm volatile( - "vld1.32 {d0[0]}, [%1] \n\t" // V[x] - "vld1.32 {d0[1]}, [%2] \n\t" // V[y] - "vld1.32 {d1[0]}, [%3] \n\t" // V[z] - "vld1.32 {d1[1]}, [%4] \n\t" // V[w] - "vld1.32 {d18 - d21}, [%5]! \n\t" // M[m0-m7] - "vld1.32 {d22 - d25}, [%5] \n\t" // M[m8-m15] - - "vmul.f32 q13, q9, d0[0] \n\t" // DST->V = M[m0-m3] * V[x] - "vmla.f32 q13, q10, d0[1] \n\t" // DST->V += M[m4-m7] * V[y] - "vmla.f32 q13, q11, d1[0] \n\t" // DST->V += M[m8-m11] * V[z] - "vmla.f32 q13, q12, d1[1] \n\t" // DST->V += M[m12-m15] * V[w] - - "vst1.32 {d26}, [%0]! \n\t" // DST->V[x, y] - "vst1.32 {d27[0]}, [%0] \n\t" // DST->V[z] - : - : "r"(dst), "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m) - : "q0", "q9", "q10","q11", "q12", "q13", "memory" - ); -} - -inline void MathUtilNeon::transformVec4(const float* m, const float* v, float* dst) -{ - asm volatile - ( - "vld1.32 {d0, d1}, [%1] \n\t" // V[x, y, z, w] - "vld1.32 {d18 - d21}, [%2]! \n\t" // M[m0-m7] - "vld1.32 {d22 - d25}, [%2] \n\t" // M[m8-m15] - - "vmul.f32 q13, q9, d0[0] \n\t" // DST->V = M[m0-m3] * V[x] - "vmla.f32 q13, q10, d0[1] \n\t" // DST->V = M[m4-m7] * V[y] - "vmla.f32 q13, q11, d1[0] \n\t" // DST->V = M[m8-m11] * V[z] - "vmla.f32 q13, q12, d1[1] \n\t" // DST->V = M[m12-m15] * V[w] - - "vst1.32 {d26, d27}, [%0] \n\t" // DST->V - : - : "r"(dst), "r"(v), "r"(m) - : "q0", "q9", "q10","q11", "q12", "q13", "memory" - ); -} - -inline void MathUtilNeon::crossVec3(const float* v1, const float* v2, float* dst) -{ - asm volatile( - "vld1.32 {d1[1]}, [%1] \n\t" // - "vld1.32 {d0}, [%2] \n\t" // - "vmov.f32 s2, s1 \n\t" // q0 = (v1y, v1z, v1z, v1x) - - "vld1.32 {d2[1]}, [%3] \n\t" // - "vld1.32 {d3}, [%4] \n\t" // - "vmov.f32 s4, s7 \n\t" // q1 = (v2z, v2x, v2y, v2z) - - "vmul.f32 d4, d0, d2 \n\t" // x = v1y * v2z, y = v1z * v2x - "vmls.f32 d4, d1, d3 \n\t" // x -= v1z * v2y, y-= v1x - v2z - - "vmul.f32 d5, d3, d1[1] \n\t" // z = v1x * v2y - "vmls.f32 d5, d0, d2[1] \n\t" // z-= v1y * vx - - "vst1.32 {d4}, [%0]! \n\t" // V[x, y] - "vst1.32 {d5[0]}, [%0] \n\t" // V[z] - : - : "r"(dst), "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1)) - : "q0", "q1", "q2", "memory" - ); -} - -inline void MathUtilNeon::transformVertices(ax::V3F_C4B_T2F* dst, const ax::V3F_C4B_T2F* src, size_t count, const ax::Mat4& transform) -{ - auto end = dst + count; + inline static void multiplyMatrix(const _xm128_t* m, float scalar, _xm128_t* dst) + { + _xm128_t s = vdupq_n_f32(scalar); + UTILS_UNROLL + for (int i = 0; i < 4; ++i) + { + dst[i] = vmulq_f32(m[i], s); + } + } - // Load matrix - float32x4_t mc0 = vld1q_f32(transform.m); - float32x4_t mc1 = vld1q_f32(transform.m + 4); - float32x4_t mc2 = vld1q_f32(transform.m + 8); - float32x4_t mc3 = vld1q_f32(transform.m + 12); + inline static void multiplyMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst) + { + float32x4_t product[4]; + float32x4_t val; + UTILS_UNROLL + for (int i = 0; i < 4; ++i) + { + val = vmulq_n_f32(m1[0], vgetq_lane_f32(m2[i], 0)); + val = vmlaq_n_f32(val, m1[1], vgetq_lane_f32(m2[i], 1)); + val = vmlaq_n_f32(val, m1[2], vgetq_lane_f32(m2[i], 2)); + val = vmlaq_n_f32(val, m1[3], vgetq_lane_f32(m2[i], 3)); + product[i] = val; + } + memcpy(dst, product, sizeof(product)); + } - // Process 4 vertices at a time - auto end4 = dst + count / 4 * 4; - while (dst < end4) + inline static void negateMatrix(const _xm128_t* m, _xm128_t* dst) { - // Load 4 vertices. Note that color will also get loaded into w - float32x2_t xy0 = vld1_f32(&src[0].vertices.x); - float32x2_t zw0 = vld1_f32(&src[0].vertices.z); - float32x2_t uv0 = vld1_f32(&src[0].texCoords.u); - float32x2_t xy1 = vld1_f32(&src[1].vertices.x); - float32x2_t zw1 = vld1_f32(&src[1].vertices.z); - float32x2_t uv1 = vld1_f32(&src[1].texCoords.u); - float32x2_t xy2 = vld1_f32(&src[2].vertices.x); - float32x2_t zw2 = vld1_f32(&src[2].vertices.z); - float32x2_t uv2 = vld1_f32(&src[2].texCoords.u); - float32x2_t xy3 = vld1_f32(&src[3].vertices.x); - float32x2_t zw3 = vld1_f32(&src[3].vertices.z); - float32x2_t uv3 = vld1_f32(&src[3].texCoords.u); - - // Multiply x by column 0 - float32x4_t r0 = vmulq_lane_f32(mc0, xy0, 0); - float32x4_t r1 = vmulq_lane_f32(mc0, xy1, 0); - float32x4_t r2 = vmulq_lane_f32(mc0, xy2, 0); - float32x4_t r3 = vmulq_lane_f32(mc0, xy3, 0); - - // Multiply y by column 1 and add to result - r0 = vmlaq_lane_f32(r0, mc1, xy0, 1); - r1 = vmlaq_lane_f32(r1, mc1, xy1, 1); - r2 = vmlaq_lane_f32(r2, mc1, xy2, 1); - r3 = vmlaq_lane_f32(r3, mc1, xy3, 1); - - // Multiply z by column 2 and add to result - r0 = vmlaq_lane_f32(r0, mc2, zw0, 0); - r1 = vmlaq_lane_f32(r1, mc2, zw1, 0); - r2 = vmlaq_lane_f32(r2, mc2, zw2, 0); - r3 = vmlaq_lane_f32(r3, mc2, zw3, 0); - - // Add column 3 - r0 = vaddq_f32(r0, mc3); - r1 = vaddq_f32(r1, mc3); - r2 = vaddq_f32(r2, mc3); - r3 = vaddq_f32(r3, mc3); - - // Set color - r0 = vsetq_lane_f32(vget_lane_f32(zw0, 1), r0, 3); - r1 = vsetq_lane_f32(vget_lane_f32(zw1, 1), r1, 3); - r2 = vsetq_lane_f32(vget_lane_f32(zw2, 1), r2, 3); - r3 = vsetq_lane_f32(vget_lane_f32(zw3, 1), r3, 3); - - // Store result - vst1q_f32(&dst[0].vertices.x, r0); - vst1_f32(&dst[0].texCoords.u, uv0); - vst1q_f32(&dst[1].vertices.x, r1); - vst1_f32(&dst[1].texCoords.u, uv1); - vst1q_f32(&dst[2].vertices.x, r2); - vst1_f32(&dst[2].texCoords.u, uv2); - vst1q_f32(&dst[3].vertices.x, r3); - vst1_f32(&dst[3].texCoords.u, uv3); - - dst += 4; - src += 4; + UTILS_UNROLL + for (int i = 0; i < 4; ++i) + { + dst[i] = vnegq_f32(m[i]); + } } - // Process remaining vertices - while (dst < end) + inline static void transposeMatrix(const _xm128_t* m, _xm128_t* dst) { - // Load vertex - float32x2_t xy = vld1_f32(&src->vertices.x); - float32x2_t zw = vld1_f32(&src->vertices.z); - float32x2_t uv = vld1_f32(&src->texCoords.u); - - // Multiply x by column 0 - float32x4_t r = vmulq_lane_f32(mc0, xy, 0); - // Multiply y by column 1 and add to result - r = vmlaq_lane_f32(r, mc1, xy, 1); - // Multiply z by column 2 and add to result - r = vmlaq_lane_f32(r, mc2, zw, 0); - // Add column 3 - r = vaddq_f32(r, mc3); - - // Set color - r = vsetq_lane_f32(vget_lane_f32(zw, 1), r, 3); - - // Store result - vst1q_f32(&dst->vertices.x, r); - vst1_f32(&dst->texCoords.u, uv); - - ++dst; - ++src; + auto tmp0 = vzipq_f32(m[0], m[2]); + auto tmp1 = vzipq_f32(m[1], m[3]); + auto tmp2 = vzipq_f32(tmp0.val[0], tmp1.val[0]); + auto tmp3 = vzipq_f32(tmp0.val[1], tmp1.val[1]); + + dst[0] = tmp2.val[0]; + dst[1] = tmp2.val[1]; + dst[2] = tmp3.val[0]; + dst[3] = tmp3.val[1]; } -} + + inline static void transformVec4(const _xm128_t* m, float x, float y, float z, float w, float* dst/*vec3*/) + { + auto v0 = vmulq_n_f32(m[0], x); + auto v1 = vmulq_n_f32(m[1], y); + auto v2 = vmulq_n_f32(m[2], z); + auto v3 = vmulq_n_f32(m[3], w); + auto prod = vaddq_f32(v0, vaddq_f32(v1, vaddq_f32(v2, v3))); + vst1_f32(dst, vget_low_f32(prod)); + vst1_lane_f32(dst + 2, vget_high_f32(prod), 0); + } + + inline static void transformVec4(const _xm128_t* m, const float* v /*vec4*/, float* dst /*vec4*/) + { + auto v0 = vmulq_n_f32(m[0], v[0]); + auto v1 = vmulq_n_f32(m[1], v[1]); + auto v2 = vmulq_n_f32(m[2], v[2]); + auto v3 = vmulq_n_f32(m[3], v[3]); + auto prod = vaddq_f32(v0, vaddq_f32(v1, vaddq_f32(v2, v3))); + vst1q_f32(dst, prod); + } + + inline static void crossVec3(const float* v1, const float* v2, float* dst) + { + // refer to: + // https://developer.arm.com/documentation/den0018/a/NEON-Code-Examples-with-Mixed-Operations/Cross-product/Single-cross-product + // Vector a is stored in memory such that ai is at the lower address and + // ak is at the higher address. Vector b is also stored in the same way. + + float32x4_t vec_a = vcombine_f32(vld1_f32(v1 + 1), vld1_f32(v1)); // Q register = [aj, ai, ak, aj] + float32x4_t vec_b = vcombine_f32(vld1_f32(v2 + 1), vld1_f32(v2)); // Q register = [bj, bi, bk, bj] + float32x4_t vec_a_rot = vextq_f32(vec_a, vec_a, 1); + float32x4_t vec_b_rot = vextq_f32(vec_b, vec_b, 1); + + float32x4_t prod = vmulq_f32(vec_a, vec_b_rot); + + // prod = [ ajbj, aibj, akbi, ajbk ] + + prod = vmlsq_f32(prod, vec_a_rot, vec_b); + // prod = [ ajbj-ajbj, aibj-ajbi, akbi-aibk, ajbk-akbj ] + + vst1_f32(dst, vget_low_f32(prod)); // Store the lower two elements to address r + vst1_lane_f32(dst + 2, vget_high_f32(prod), 0); // Store the 3rd element + } + +#if AX_64BITS + inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform) + { + auto end = dst + count; + + // Load matrix + float32x4x4_t m = vld1q_f32_x4(transform.m); + + // Process 4 vertices at a time if there's enough data + auto end4 = dst + count / 4 * 4; + while (dst < end4) + { + // Do this for each vertex + // dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12]; + // dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13]; + // dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14]; + + // First, load each vertex, multiply x by column 0 and add to column 3 + // Note: since we're reading 4 floats it will load color bytes into v.w + float32x4_t v0 = vld1q_f32(&src[0].vertices.x); + float32x4_t r0 = vmlaq_laneq_f32(m.val[3], m.val[0], v0, 0); + float32x4_t v1 = vld1q_f32(&src[1].vertices.x); + float32x4_t r1 = vmlaq_laneq_f32(m.val[3], m.val[0], v1, 0); + float32x4_t v2 = vld1q_f32(&src[2].vertices.x); + float32x4_t r2 = vmlaq_laneq_f32(m.val[3], m.val[0], v2, 0); + float32x4_t v3 = vld1q_f32(&src[3].vertices.x); + float32x4_t r3 = vmlaq_laneq_f32(m.val[3], m.val[0], v3, 0); + + // Load texCoords + float32x2_t uv0 = vld1_f32(&src[0].texCoords.u); + float32x2_t uv1 = vld1_f32(&src[1].texCoords.u); + float32x2_t uv2 = vld1_f32(&src[2].texCoords.u); + float32x2_t uv3 = vld1_f32(&src[3].texCoords.u); + + // Multiply y by column 1 and add to result + r0 = vmlaq_laneq_f32(r0, m.val[1], v0, 1); + r1 = vmlaq_laneq_f32(r1, m.val[1], v1, 1); + r2 = vmlaq_laneq_f32(r2, m.val[1], v2, 1); + r3 = vmlaq_laneq_f32(r3, m.val[1], v3, 1); + + // Multiply z by column 2 and add to result + r0 = vmlaq_laneq_f32(r0, m.val[2], v0, 2); + r1 = vmlaq_laneq_f32(r1, m.val[2], v1, 2); + r2 = vmlaq_laneq_f32(r2, m.val[2], v2, 2); + r3 = vmlaq_laneq_f32(r3, m.val[2], v3, 2); + + // Set w to loaded color + r0 = vsetq_lane_f32(vgetq_lane_f32(v0, 3), r0, 3); + r1 = vsetq_lane_f32(vgetq_lane_f32(v1, 3), r1, 3); + r2 = vsetq_lane_f32(vgetq_lane_f32(v2, 3), r2, 3); + r3 = vsetq_lane_f32(vgetq_lane_f32(v3, 3), r3, 3); + + // Store result + vst1q_f32(&dst[0].vertices.x, r0); + vst1_f32(&dst[0].texCoords.u, uv0); + vst1q_f32(&dst[1].vertices.x, r1); + vst1_f32(&dst[1].texCoords.u, uv1); + vst1q_f32(&dst[2].vertices.x, r2); + vst1_f32(&dst[2].texCoords.u, uv2); + vst1q_f32(&dst[3].vertices.x, r3); + vst1_f32(&dst[3].texCoords.u, uv3); + + dst += 4; + src += 4; + } + + // Process remaining vertices one by one + while (dst < end) + { + float32x4_t v = vld1q_f32(&src->vertices.x); + float32x4_t r = vmlaq_laneq_f32(m.val[3], m.val[0], v, 0); + r = vmlaq_laneq_f32(r, m.val[1], v, 1); + r = vmlaq_laneq_f32(r, m.val[2], v, 2); + r = vsetq_lane_f32(vgetq_lane_f32(v, 3), r, 3); + float32x2_t uv = vld1_f32(&src->texCoords.u); + vst1q_f32(&dst->vertices.x, r); + vst1_f32(&dst->texCoords.u, uv); + + ++dst; + ++src; + } + } + + inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) + { + auto end = dst + count; + auto off = vdupq_n_u16(offset); + + if (count < 8) + goto LEFTOVER; + + // Process 32 indices at a time if there's enough data + while (count >= 32) + { + // Load 32 indices + uint16x8x4_t v = vld1q_u16_x4(src); + + // Add offset + v.val[0] = vaddq_u16(v.val[0], off); + v.val[1] = vaddq_u16(v.val[1], off); + v.val[2] = vaddq_u16(v.val[2], off); + v.val[3] = vaddq_u16(v.val[3], off); + + // Store result + vst1q_u16_x4(dst, v); + + dst += 32; + src += 32; + count -= 32; + } + + // Process 8 indices at a time if there's enough data + while (count >= 8) + { + uint16x8_t v = vld1q_u16(src); + v = vaddq_u16(v, off); + vst1q_u16(dst, v); + + dst += 8; + src += 8; + count -= 8; + } + + LEFTOVER: + // Process remaining indices one by one + while (count > 0) + { + *dst = *src + offset; + ++dst; + ++src; + --count; + } + } +#else + inline static void transformVertices(ax::V3F_C4B_T2F* dst, + const ax::V3F_C4B_T2F* src, + size_t count, + const ax::Mat4& transform) + { + auto end = dst + count; + + // Load matrix + float32x4_t mc0 = vld1q_f32(transform.m); + float32x4_t mc1 = vld1q_f32(transform.m + 4); + float32x4_t mc2 = vld1q_f32(transform.m + 8); + float32x4_t mc3 = vld1q_f32(transform.m + 12); + + // Process 4 vertices at a time + auto end4 = dst + count / 4 * 4; + while (dst < end4) + { + // Load 4 vertices. Note that color will also get loaded into w + float32x2_t xy0 = vld1_f32(&src[0].vertices.x); + float32x2_t zw0 = vld1_f32(&src[0].vertices.z); + float32x2_t uv0 = vld1_f32(&src[0].texCoords.u); + float32x2_t xy1 = vld1_f32(&src[1].vertices.x); + float32x2_t zw1 = vld1_f32(&src[1].vertices.z); + float32x2_t uv1 = vld1_f32(&src[1].texCoords.u); + float32x2_t xy2 = vld1_f32(&src[2].vertices.x); + float32x2_t zw2 = vld1_f32(&src[2].vertices.z); + float32x2_t uv2 = vld1_f32(&src[2].texCoords.u); + float32x2_t xy3 = vld1_f32(&src[3].vertices.x); + float32x2_t zw3 = vld1_f32(&src[3].vertices.z); + float32x2_t uv3 = vld1_f32(&src[3].texCoords.u); + + // Multiply x by column 0 + float32x4_t r0 = vmulq_lane_f32(mc0, xy0, 0); + float32x4_t r1 = vmulq_lane_f32(mc0, xy1, 0); + float32x4_t r2 = vmulq_lane_f32(mc0, xy2, 0); + float32x4_t r3 = vmulq_lane_f32(mc0, xy3, 0); + + // Multiply y by column 1 and add to result + r0 = vmlaq_lane_f32(r0, mc1, xy0, 1); + r1 = vmlaq_lane_f32(r1, mc1, xy1, 1); + r2 = vmlaq_lane_f32(r2, mc1, xy2, 1); + r3 = vmlaq_lane_f32(r3, mc1, xy3, 1); + + // Multiply z by column 2 and add to result + r0 = vmlaq_lane_f32(r0, mc2, zw0, 0); + r1 = vmlaq_lane_f32(r1, mc2, zw1, 0); + r2 = vmlaq_lane_f32(r2, mc2, zw2, 0); + r3 = vmlaq_lane_f32(r3, mc2, zw3, 0); + + // Add column 3 + r0 = vaddq_f32(r0, mc3); + r1 = vaddq_f32(r1, mc3); + r2 = vaddq_f32(r2, mc3); + r3 = vaddq_f32(r3, mc3); + + // Set color + r0 = vsetq_lane_f32(vget_lane_f32(zw0, 1), r0, 3); + r1 = vsetq_lane_f32(vget_lane_f32(zw1, 1), r1, 3); + r2 = vsetq_lane_f32(vget_lane_f32(zw2, 1), r2, 3); + r3 = vsetq_lane_f32(vget_lane_f32(zw3, 1), r3, 3); + + // Store result + vst1q_f32(&dst[0].vertices.x, r0); + vst1_f32(&dst[0].texCoords.u, uv0); + vst1q_f32(&dst[1].vertices.x, r1); + vst1_f32(&dst[1].texCoords.u, uv1); + vst1q_f32(&dst[2].vertices.x, r2); + vst1_f32(&dst[2].texCoords.u, uv2); + vst1q_f32(&dst[3].vertices.x, r3); + vst1_f32(&dst[3].texCoords.u, uv3); + + dst += 4; + src += 4; + } + + // Process remaining vertices + while (dst < end) + { + // Load vertex + float32x2_t xy = vld1_f32(&src->vertices.x); + float32x2_t zw = vld1_f32(&src->vertices.z); + float32x2_t uv = vld1_f32(&src->texCoords.u); + + // Multiply x by column 0 + float32x4_t r = vmulq_lane_f32(mc0, xy, 0); + // Multiply y by column 1 and add to result + r = vmlaq_lane_f32(r, mc1, xy, 1); + // Multiply z by column 2 and add to result + r = vmlaq_lane_f32(r, mc2, zw, 0); + // Add column 3 + r = vaddq_f32(r, mc3); + + // Set color + r = vsetq_lane_f32(vget_lane_f32(zw, 1), r, 3); + + // Store result + vst1q_f32(&dst->vertices.x, r); + vst1_f32(&dst->texCoords.u, uv); + + ++dst; + ++src; + } + } +#endif +}; NS_AX_MATH_END diff --git a/core/math/MathUtilNeon64.inl b/core/math/MathUtilNeon64.inl deleted file mode 100644 index 1bfb02759dc1..000000000000 --- a/core/math/MathUtilNeon64.inl +++ /dev/null @@ -1,398 +0,0 @@ -/** - Copyright 2013 BlackBerry Inc. - Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md). - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - Original file from GamePlay3D: http://gameplay3d.org - - This file was modified to fit the cocos2d-x project - */ - -#include -#include "base/Types.h" - -NS_AX_MATH_BEGIN - -class MathUtilNeon64 -{ -public: - inline static void addMatrix(const float* m, float scalar, float* dst); - inline static void addMatrix(const float* m1, const float* m2, float* dst); - inline static void subtractMatrix(const float* m1, const float* m2, float* dst); - inline static void multiplyMatrix(const float* m, float scalar, float* dst); - inline static void multiplyMatrix(const float* m1, const float* m2, float* dst); - - inline static void negateMatrix(const float* m, float* dst); - inline static void transposeMatrix(const float* m, float* dst); - - inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst); - inline static void transformVec4(const float* m, const float* v, float* dst); - inline static void crossVec3(const float* v1, const float* v2, float* dst); - - inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform); - inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset); -}; - -inline void MathUtilNeon64::addMatrix(const float* m, float scalar, float* dst) -{ - asm volatile( - "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // M[m0-m7] M[m8-m15] - "ld1r {v4.4s}, [%2] \n\t" //ssss - - "fadd v8.4s, v0.4s, v4.4s \n\t" // DST->M[m0-m3] = M[m0-m3] + s - "fadd v9.4s, v1.4s, v4.4s \n\t" // DST->M[m4-m7] = M[m4-m7] + s - "fadd v10.4s, v2.4s, v4.4s \n\t" // DST->M[m8-m11] = M[m8-m11] + s - "fadd v11.4s, v3.4s, v4.4s \n\t" // DST->M[m12-m15] = M[m12-m15] + s - - "st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n\t" // Result in V9 - : - : "r"(dst), "r"(m), "r"(&scalar) - : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "memory" - ); -} - -inline void MathUtilNeon64::addMatrix(const float* m1, const float* m2, float* dst) -{ - asm volatile( - "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // M1[m0-m7] M1[m8-m15] - "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%2] \n\t" // M2[m0-m7] M2[m8-m15] - - "fadd v12.4s, v0.4s, v8.4s \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3] - "fadd v13.4s, v1.4s, v9.4s \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7] - "fadd v14.4s, v2.4s, v10.4s \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11] - "fadd v15.4s, v3.4s, v11.4s \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15] - - "st4 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t" // DST->M[m0-m7] DST->M[m8-m15] - : - : "r"(dst), "r"(m1), "r"(m2) - : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" - ); -} - -inline void MathUtilNeon64::subtractMatrix(const float* m1, const float* m2, float* dst) -{ - asm volatile( - "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // M1[m0-m7] M1[m8-m15] - "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%2] \n\t" // M2[m0-m7] M2[m8-m15] - - "fsub v12.4s, v0.4s, v8.4s \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3] - "fsub v13.4s, v1.4s, v9.4s \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7] - "fsub v14.4s, v2.4s, v10.4s \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11] - "fsub v15.4s, v3.4s, v11.4s \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15] - - "st4 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t" // DST->M[m0-m7] DST->M[m8-m15] - : - : "r"(dst), "r"(m1), "r"(m2) - : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" - ); -} - -inline void MathUtilNeon64::multiplyMatrix(const float* m, float scalar, float* dst) -{ - asm volatile( - "ld1 {v0.s}[0], [%2] \n\t" //s - "ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1] \n\t" //M[m0-m7] M[m8-m15] - - "fmul v8.4s, v4.4s, v0.s[0] \n\t" // DST->M[m0-m3] = M[m0-m3] * s - "fmul v9.4s, v5.4s, v0.s[0] \n\t" // DST->M[m4-m7] = M[m4-m7] * s - "fmul v10.4s, v6.4s, v0.s[0] \n\t" // DST->M[m8-m11] = M[m8-m11] * s - "fmul v11.4s, v7.4s, v0.s[0] \n\t" // DST->M[m12-m15] = M[m12-m15] * s - - "st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n\t" // DST->M[m0-m7] DST->M[m8-m15] - : - : "r"(dst), "r"(m), "r"(&scalar) - : "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory" - ); -} - -inline void MathUtilNeon64::multiplyMatrix(const float* m1, const float* m2, float* dst) -{ - asm volatile( - "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1] \n\t" // M1[m0-m7] M1[m8-m15] M2[m0-m7] M2[m8-m15] - "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2] \n\t" // M2[m0-m15] - - - "fmul v12.4s, v8.4s, v0.s[0] \n\t" // DST->M[m0-m3] = M1[m0-m3] * M2[m0] - "fmul v13.4s, v8.4s, v0.s[1] \n\t" // DST->M[m4-m7] = M1[m4-m7] * M2[m4] - "fmul v14.4s, v8.4s, v0.s[2] \n\t" // DST->M[m8-m11] = M1[m8-m11] * M2[m8] - "fmul v15.4s, v8.4s, v0.s[3] \n\t" // DST->M[m12-m15] = M1[m12-m15] * M2[m12] - - "fmla v12.4s, v9.4s, v1.s[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m1] - "fmla v13.4s, v9.4s, v1.s[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m5] - "fmla v14.4s, v9.4s, v1.s[2] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m9] - "fmla v15.4s, v9.4s, v1.s[3] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m13] - - "fmla v12.4s, v10.4s, v2.s[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m2] - "fmla v13.4s, v10.4s, v2.s[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m6] - "fmla v14.4s, v10.4s, v2.s[2] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m10] - "fmla v15.4s, v10.4s, v2.s[3] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m14] - - "fmla v12.4s, v11.4s, v3.s[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m3] - "fmla v13.4s, v11.4s, v3.s[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m7] - "fmla v14.4s, v11.4s, v3.s[2] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m11] - "fmla v15.4s, v11.4s, v3.s[3] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m15] - - "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t" // DST->M[m0-m7]// DST->M[m8-m15] - - : // output - : "r"(dst), "r"(m1), "r"(m2) // input - note *value* of pointer doesn't change. - : "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - ); -} - -inline void MathUtilNeon64::negateMatrix(const float* m, float* dst) -{ - asm volatile( - "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // load m0-m7 load m8-m15 - - "fneg v4.4s, v0.4s \n\t" // negate m0-m3 - "fneg v5.4s, v1.4s \n\t" // negate m4-m7 - "fneg v6.4s, v2.4s \n\t" // negate m8-m15 - "fneg v7.4s, v3.4s \n\t" // negate m8-m15 - - "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n\t" // store m0-m7 store m8-m15 - : - : "r"(dst), "r"(m) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory" - ); -} - -inline void MathUtilNeon64::transposeMatrix(const float* m, float* dst) -{ - asm volatile( - "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3] - //DST->M[m1, m5, m9, m12] = M[m4-m7] - "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n\t" - : - : "r"(dst), "r"(m) - : "v0", "v1", "v2", "v3", "memory" - ); -} - -inline void MathUtilNeon64::transformVec4(const float* m, float x, float y, float z, float w, float* dst) -{ - asm volatile( - "ld1 {v0.s}[0], [%1] \n\t" // V[x] - "ld1 {v0.s}[1], [%2] \n\t" // V[y] - "ld1 {v0.s}[2], [%3] \n\t" // V[z] - "ld1 {v0.s}[3], [%4] \n\t" // V[w] - "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%5] \n\t" // M[m0-m7] M[m8-m15] - - - "fmul v13.4s, v9.4s, v0.s[0] \n\t" // DST->V = M[m0-m3] * V[x] - "fmla v13.4s, v10.4s, v0.s[1] \n\t" // DST->V += M[m4-m7] * V[y] - "fmla v13.4s, v11.4s, v0.s[2] \n\t" // DST->V += M[m8-m11] * V[z] - "fmla v13.4s, v12.4s, v0.s[3] \n\t" // DST->V += M[m12-m15] * V[w] - - //"st1 {v13.4s}, [%0] \n\t" // DST->V[x, y] // DST->V[z] - "st1 {v13.2s}, [%0], 8 \n\t" - "st1 {v13.s}[2], [%0] \n\t" - : "+r"(dst) - : "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m) - : "v0", "v9", "v10","v11", "v12", "v13", "memory" - ); -} - -inline void MathUtilNeon64::transformVec4(const float* m, const float* v, float* dst) -{ - asm volatile - ( - "ld1 {v0.4s}, [%1] \n\t" // V[x, y, z, w] - "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%2] \n\t" // M[m0-m7] M[m8-m15] - - "fmul v13.4s, v9.4s, v0.s[0] \n\t" // DST->V = M[m0-m3] * V[x] - "fmla v13.4s, v10.4s, v0.s[1] \n\t" // DST->V = M[m4-m7] * V[y] - "fmla v13.4s, v11.4s, v0.s[2] \n\t" // DST->V = M[m8-m11] * V[z] - "fmla v13.4s, v12.4s, v0.s[3] \n\t" // DST->V = M[m12-m15] * V[w] - - "st1 {v13.4s}, [%0] \n\t" // DST->V - : - : "r"(dst), "r"(v), "r"(m) - : "v0", "v9", "v10","v11", "v12", "v13", "memory" - ); -} - -inline void MathUtilNeon64::crossVec3(const float* v1, const float* v2, float* dst) -{ - asm volatile( - "ld1 {v0.2s}, [%2] \n\t" - "ld1 {v0.s}[2], [%1] \n\t" - "mov v0.s[3], v0.s[0] \n\t" // q0 = (v1y, v1z, v1x, v1x) - - "ld1 {v1.4s}, [%3] \n\t" - "mov v1.s[3], v1.s[0] \n\t" // q1 = (v2x, v2y, v2z, v2x) - - "fmul v2.4s, v0.4s, v1.4s \n\t" // x = v1y * v2z, y = v1z * v2x - - - "mov v0.s[0], v0.s[1] \n\t" - "mov v0.s[1], v0.s[2] \n\t" - "mov v0.s[2], v0.s[3] \n\t" - - "mov v1.s[3], v1.s[2] \n\t" - - "fmul v0.4s, v0.4s, v1.4s \n\t" - - "mov v0.s[3], v0.s[1] \n\t" - "mov v0.s[1], v0.s[2] \n\t" - "mov v0.s[2], v0.s[0] \n\t" - - "fsub v2.4s, v0.4s, v2.4s \n\t" - - "mov v2.s[0], v2.s[1] \n\t" - "mov v2.s[1], v2.s[2] \n\t" - "mov v2.s[2], v2.s[3] \n\t" - - "st1 {v2.2s}, [%0], 8 \n\t" // V[x, y] - "st1 {v2.s}[2], [%0] \n\t" // V[z] - : "+r"(dst) - : "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1)) - : "v0", "v1", "v2", "memory" - ); -} - -inline void MathUtilNeon64::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform) -{ - auto end = dst + count; - - // Load matrix - float32x4x4_t m = vld1q_f32_x4(transform.m); - - // Process 4 vertices at a time if there's enough data - auto end4 = dst + count / 4 * 4; - while (dst < end4) - { - // Do this for each vertex - // dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12]; - // dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13]; - // dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14]; - - // First, load each vertex, multiply x by column 0 and add to column 3 - // Note: since we're reading 4 floats it will load color bytes into v.w - float32x4_t v0 = vld1q_f32(&src[0].vertices.x); - float32x4_t r0 = vmlaq_laneq_f32(m.val[3], m.val[0], v0, 0); - float32x4_t v1 = vld1q_f32(&src[1].vertices.x); - float32x4_t r1 = vmlaq_laneq_f32(m.val[3], m.val[0], v1, 0); - float32x4_t v2 = vld1q_f32(&src[2].vertices.x); - float32x4_t r2 = vmlaq_laneq_f32(m.val[3], m.val[0], v2, 0); - float32x4_t v3 = vld1q_f32(&src[3].vertices.x); - float32x4_t r3 = vmlaq_laneq_f32(m.val[3], m.val[0], v3, 0); - - // Load texCoords - float32x2_t uv0 = vld1_f32(&src[0].texCoords.u); - float32x2_t uv1 = vld1_f32(&src[1].texCoords.u); - float32x2_t uv2 = vld1_f32(&src[2].texCoords.u); - float32x2_t uv3 = vld1_f32(&src[3].texCoords.u); - - // Multiply y by column 1 and add to result - r0 = vmlaq_laneq_f32(r0, m.val[1], v0, 1); - r1 = vmlaq_laneq_f32(r1, m.val[1], v1, 1); - r2 = vmlaq_laneq_f32(r2, m.val[1], v2, 1); - r3 = vmlaq_laneq_f32(r3, m.val[1], v3, 1); - - // Multiply z by column 2 and add to result - r0 = vmlaq_laneq_f32(r0, m.val[2], v0, 2); - r1 = vmlaq_laneq_f32(r1, m.val[2], v1, 2); - r2 = vmlaq_laneq_f32(r2, m.val[2], v2, 2); - r3 = vmlaq_laneq_f32(r3, m.val[2], v3, 2); - - // Set w to loaded color - r0 = vsetq_lane_f32(vgetq_lane_f32(v0, 3), r0, 3); - r1 = vsetq_lane_f32(vgetq_lane_f32(v1, 3), r1, 3); - r2 = vsetq_lane_f32(vgetq_lane_f32(v2, 3), r2, 3); - r3 = vsetq_lane_f32(vgetq_lane_f32(v3, 3), r3, 3); - - // Store result - vst1q_f32(&dst[0].vertices.x, r0); - vst1_f32(&dst[0].texCoords.u, uv0); - vst1q_f32(&dst[1].vertices.x, r1); - vst1_f32(&dst[1].texCoords.u, uv1); - vst1q_f32(&dst[2].vertices.x, r2); - vst1_f32(&dst[2].texCoords.u, uv2); - vst1q_f32(&dst[3].vertices.x, r3); - vst1_f32(&dst[3].texCoords.u, uv3); - - dst += 4; - src += 4; - } - - // Process remaining vertices one by one - while (dst < end) - { - float32x4_t v = vld1q_f32(&src->vertices.x); - float32x4_t r = vmlaq_laneq_f32(m.val[3], m.val[0], v, 0); - r = vmlaq_laneq_f32(r, m.val[1], v, 1); - r = vmlaq_laneq_f32(r, m.val[2], v, 2); - r = vsetq_lane_f32(vgetq_lane_f32(v, 3), r, 3); - float32x2_t uv = vld1_f32(&src->texCoords.u); - vst1q_f32(&dst->vertices.x, r); - vst1_f32(&dst->texCoords.u, uv); - - ++dst; - ++src; - } -} - -inline void MathUtilNeon64::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) -{ - auto end = dst + count; - auto off = vdupq_n_u16(offset); - - if (count < 8) - goto LEFTOVER; - - // Process 32 indices at a time if there's enough data - while (count >= 32) - { - // Load 32 indices - uint16x8x4_t v = vld1q_u16_x4(src); - - // Add offset - v.val[0] = vaddq_u16(v.val[0], off); - v.val[1] = vaddq_u16(v.val[1], off); - v.val[2] = vaddq_u16(v.val[2], off); - v.val[3] = vaddq_u16(v.val[3], off); - - // Store result - vst1q_u16_x4(dst, v); - - dst += 32; - src += 32; - count -= 32; - } - - // Process 8 indices at a time if there's enough data - while (count >= 8) - { - uint16x8_t v = vld1q_u16(src); - v = vaddq_u16(v, off); - vst1q_u16(dst, v); - - dst += 8; - src += 8; - count -= 8; - } - -LEFTOVER: - // Process remaining indices one by one - while (count > 0) - { - *dst = *src + offset; - ++dst; - ++src; - --count; - } -} - -NS_AX_MATH_END diff --git a/core/math/MathUtilSSE.inl b/core/math/MathUtilSSE.inl index 48a377bdcbff..4869fe98b1de 100644 --- a/core/math/MathUtilSSE.inl +++ b/core/math/MathUtilSSE.inl @@ -1,157 +1,276 @@ +/**************************************************************************** +Copyright (c) 2010-2012 cocos2d-x.org +Copyright (c) 2013-2017 Chukong Technologies +Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd. +Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md). + +https://axmol.dev/ + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +****************************************************************************/ + NS_AX_MATH_BEGIN -#ifdef AX_USE_SSE +#ifdef AX_SSE_INTRINSICS -void MathUtil::addMatrix(const __m128 m[4], float scalar, __m128 dst[4]) -{ - __m128 s = _mm_set1_ps(scalar); - dst[0] = _mm_add_ps(m[0], s); - dst[1] = _mm_add_ps(m[1], s); - dst[2] = _mm_add_ps(m[2], s); - dst[3] = _mm_add_ps(m[3], s); -} - -void MathUtil::addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]) +struct MathUtilSSE { - dst[0] = _mm_add_ps(m1[0], m2[0]); - dst[1] = _mm_add_ps(m1[1], m2[1]); - dst[2] = _mm_add_ps(m1[2], m2[2]); - dst[3] = _mm_add_ps(m1[3], m2[3]); -} -void MathUtil::subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]) -{ - dst[0] = _mm_sub_ps(m1[0], m2[0]); - dst[1] = _mm_sub_ps(m1[1], m2[1]); - dst[2] = _mm_sub_ps(m1[2], m2[2]); - dst[3] = _mm_sub_ps(m1[3], m2[3]); -} + static void addMatrix(const __m128 m[4], float scalar, __m128 dst[4]) + { + __m128 s = _mm_set1_ps(scalar); + dst[0] = _mm_add_ps(m[0], s); + dst[1] = _mm_add_ps(m[1], s); + dst[2] = _mm_add_ps(m[2], s); + dst[3] = _mm_add_ps(m[3], s); + } -void MathUtil::multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4]) -{ - __m128 s = _mm_set1_ps(scalar); - dst[0] = _mm_mul_ps(m[0], s); - dst[1] = _mm_mul_ps(m[1], s); - dst[2] = _mm_mul_ps(m[2], s); - dst[3] = _mm_mul_ps(m[3], s); -} - -void MathUtil::multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]) -{ - __m128 dst0, dst1, dst2, dst3; - { - __m128 e0 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(0, 0, 0, 0)); - __m128 e1 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(1, 1, 1, 1)); - __m128 e2 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(2, 2, 2, 2)); - __m128 e3 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(3, 3, 3, 3)); - - __m128 v0 = _mm_mul_ps(m1[0], e0); - __m128 v1 = _mm_mul_ps(m1[1], e1); - __m128 v2 = _mm_mul_ps(m1[2], e2); - __m128 v3 = _mm_mul_ps(m1[3], e3); - - __m128 a0 = _mm_add_ps(v0, v1); - __m128 a1 = _mm_add_ps(v2, v3); - __m128 a2 = _mm_add_ps(a0, a1); - - dst0 = a2; - } - - { - __m128 e0 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(0, 0, 0, 0)); - __m128 e1 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(1, 1, 1, 1)); - __m128 e2 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(2, 2, 2, 2)); - __m128 e3 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(3, 3, 3, 3)); - - __m128 v0 = _mm_mul_ps(m1[0], e0); - __m128 v1 = _mm_mul_ps(m1[1], e1); - __m128 v2 = _mm_mul_ps(m1[2], e2); - __m128 v3 = _mm_mul_ps(m1[3], e3); - - __m128 a0 = _mm_add_ps(v0, v1); - __m128 a1 = _mm_add_ps(v2, v3); - __m128 a2 = _mm_add_ps(a0, a1); - - dst1 = a2; - } - - { - __m128 e0 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(0, 0, 0, 0)); - __m128 e1 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(1, 1, 1, 1)); - __m128 e2 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(2, 2, 2, 2)); - __m128 e3 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(3, 3, 3, 3)); - - __m128 v0 = _mm_mul_ps(m1[0], e0); - __m128 v1 = _mm_mul_ps(m1[1], e1); - __m128 v2 = _mm_mul_ps(m1[2], e2); - __m128 v3 = _mm_mul_ps(m1[3], e3); - - __m128 a0 = _mm_add_ps(v0, v1); - __m128 a1 = _mm_add_ps(v2, v3); - __m128 a2 = _mm_add_ps(a0, a1); - - dst2 = a2; - } - - { - __m128 e0 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(0, 0, 0, 0)); - __m128 e1 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(1, 1, 1, 1)); - __m128 e2 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(2, 2, 2, 2)); - __m128 e3 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(3, 3, 3, 3)); - - __m128 v0 = _mm_mul_ps(m1[0], e0); - __m128 v1 = _mm_mul_ps(m1[1], e1); - __m128 v2 = _mm_mul_ps(m1[2], e2); - __m128 v3 = _mm_mul_ps(m1[3], e3); - - __m128 a0 = _mm_add_ps(v0, v1); - __m128 a1 = _mm_add_ps(v2, v3); - __m128 a2 = _mm_add_ps(a0, a1); - - dst3 = a2; - } - dst[0] = dst0; - dst[1] = dst1; - dst[2] = dst2; - dst[3] = dst3; -} - -void MathUtil::negateMatrix(const __m128 m[4], __m128 dst[4]) -{ - __m128 z = _mm_setzero_ps(); - dst[0] = _mm_sub_ps(z, m[0]); - dst[1] = _mm_sub_ps(z, m[1]); - dst[2] = _mm_sub_ps(z, m[2]); - dst[3] = _mm_sub_ps(z, m[3]); -} - -void MathUtil::transposeMatrix(const __m128 m[4], __m128 dst[4]) -{ - __m128 tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44); - __m128 tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE); - __m128 tmp1 = _mm_shuffle_ps(m[2], m[3], 0x44); - __m128 tmp3 = _mm_shuffle_ps(m[2], m[3], 0xEE); - - dst[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88); - dst[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD); - dst[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88); - dst[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD); -} - -void MathUtil::transformVec4(const __m128 m[4], const __m128& v, __m128& dst) -{ - __m128 col1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)); - __m128 col2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)); - __m128 col3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)); - __m128 col4 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)); - - dst = _mm_add_ps( - _mm_add_ps(_mm_mul_ps(m[0], col1), _mm_mul_ps(m[1], col2)), - _mm_add_ps(_mm_mul_ps(m[2], col3), _mm_mul_ps(m[3], col4)) - ); -} + static void addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]) + { + dst[0] = _mm_add_ps(m1[0], m2[0]); + dst[1] = _mm_add_ps(m1[1], m2[1]); + dst[2] = _mm_add_ps(m1[2], m2[2]); + dst[3] = _mm_add_ps(m1[3], m2[3]); + } -#endif + static void subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]) + { + dst[0] = _mm_sub_ps(m1[0], m2[0]); + dst[1] = _mm_sub_ps(m1[1], m2[1]); + dst[2] = _mm_sub_ps(m1[2], m2[2]); + dst[3] = _mm_sub_ps(m1[3], m2[3]); + } + + static void multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4]) + { + __m128 s = _mm_set1_ps(scalar); + dst[0] = _mm_mul_ps(m[0], s); + dst[1] = _mm_mul_ps(m[1], s); + dst[2] = _mm_mul_ps(m[2], s); + dst[3] = _mm_mul_ps(m[3], s); + } + + static void multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]) + { + __m128 dst0, dst1, dst2, dst3; + { + __m128 e0 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(0, 0, 0, 0)); + __m128 e1 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(1, 1, 1, 1)); + __m128 e2 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(2, 2, 2, 2)); + __m128 e3 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 v0 = _mm_mul_ps(m1[0], e0); + __m128 v1 = _mm_mul_ps(m1[1], e1); + __m128 v2 = _mm_mul_ps(m1[2], e2); + __m128 v3 = _mm_mul_ps(m1[3], e3); + + __m128 a0 = _mm_add_ps(v0, v1); + __m128 a1 = _mm_add_ps(v2, v3); + __m128 a2 = _mm_add_ps(a0, a1); + + dst0 = a2; + } + + { + __m128 e0 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(0, 0, 0, 0)); + __m128 e1 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(1, 1, 1, 1)); + __m128 e2 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(2, 2, 2, 2)); + __m128 e3 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 v0 = _mm_mul_ps(m1[0], e0); + __m128 v1 = _mm_mul_ps(m1[1], e1); + __m128 v2 = _mm_mul_ps(m1[2], e2); + __m128 v3 = _mm_mul_ps(m1[3], e3); + + __m128 a0 = _mm_add_ps(v0, v1); + __m128 a1 = _mm_add_ps(v2, v3); + __m128 a2 = _mm_add_ps(a0, a1); + + dst1 = a2; + } + + { + __m128 e0 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(0, 0, 0, 0)); + __m128 e1 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(1, 1, 1, 1)); + __m128 e2 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(2, 2, 2, 2)); + __m128 e3 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 v0 = _mm_mul_ps(m1[0], e0); + __m128 v1 = _mm_mul_ps(m1[1], e1); + __m128 v2 = _mm_mul_ps(m1[2], e2); + __m128 v3 = _mm_mul_ps(m1[3], e3); + + __m128 a0 = _mm_add_ps(v0, v1); + __m128 a1 = _mm_add_ps(v2, v3); + __m128 a2 = _mm_add_ps(a0, a1); + + dst2 = a2; + } + { + __m128 e0 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(0, 0, 0, 0)); + __m128 e1 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(1, 1, 1, 1)); + __m128 e2 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(2, 2, 2, 2)); + __m128 e3 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 v0 = _mm_mul_ps(m1[0], e0); + __m128 v1 = _mm_mul_ps(m1[1], e1); + __m128 v2 = _mm_mul_ps(m1[2], e2); + __m128 v3 = _mm_mul_ps(m1[3], e3); + + __m128 a0 = _mm_add_ps(v0, v1); + __m128 a1 = _mm_add_ps(v2, v3); + __m128 a2 = _mm_add_ps(a0, a1); + + dst3 = a2; + } + dst[0] = dst0; + dst[1] = dst1; + dst[2] = dst2; + dst[3] = dst3; + } + + static void negateMatrix(const __m128 m[4], __m128 dst[4]) + { + __m128 z = _mm_setzero_ps(); + dst[0] = _mm_sub_ps(z, m[0]); + dst[1] = _mm_sub_ps(z, m[1]); + dst[2] = _mm_sub_ps(z, m[2]); + dst[3] = _mm_sub_ps(z, m[3]); + } + + static void transposeMatrix(const __m128 m[4], __m128 dst[4]) + { + __m128 tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44); + __m128 tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE); + __m128 tmp1 = _mm_shuffle_ps(m[2], m[3], 0x44); + __m128 tmp3 = _mm_shuffle_ps(m[2], m[3], 0xEE); + + dst[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88); + dst[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD); + dst[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88); + dst[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD); + } + + static void transformVec4(const __m128 m[4], float x, float y, float z, float w, float* dst /*vec3*/) + { + //__m128 res = _mm_set_ps(w, z, y, x); + //__m128 xx = _mm_shuffle_ps(res, res, _MM_SHUFFLE(0, 0, 0, 0)); + //__m128 yy = _mm_shuffle_ps(res, res, _MM_SHUFFLE(1, 1, 1, 1)); + //__m128 zz = _mm_shuffle_ps(res, res, _MM_SHUFFLE(2, 2, 2, 2)); + //__m128 ww = _mm_shuffle_ps(res, res, _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 xx = _mm_set1_ps(x); + __m128 yy = _mm_set1_ps(y); + __m128 zz = _mm_set1_ps(z); + __m128 ww = _mm_set1_ps(w); + + auto res = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m[0], xx), _mm_mul_ps(m[1], yy)), + _mm_add_ps(_mm_mul_ps(m[2], zz), _mm_mul_ps(m[3], ww))); + + _mm_storel_pi((__m64*)dst, res); + +# if defined(__SSE4_1__) + *reinterpret_cast(dst + 2) = _mm_extract_ps(res, 2); +# else + dst[2] = _mm_cvtss_f32(_mm_movehl_ps(res, res)); +# endif + } + + static void transformVec4(const __m128 m[4], const float* v /*vec4*/, float* dst /*vec4*/) + { + //__m128 res = _mm_loadu_ps(v); + //__m128 xx = _mm_shuffle_ps(res, res, _MM_SHUFFLE(0, 0, 0, 0)); + //__m128 yy = _mm_shuffle_ps(res, res, _MM_SHUFFLE(1, 1, 1, 1)); + //__m128 zz = _mm_shuffle_ps(res, res, _MM_SHUFFLE(2, 2, 2, 2)); + //__m128 ww = _mm_shuffle_ps(res, res, _MM_SHUFFLE(3, 3, 3, 3)); + + __m128 xx = _mm_set1_ps(v[0]); + __m128 yy = _mm_set1_ps(v[1]); + __m128 zz = _mm_set1_ps(v[2]); + __m128 ww = _mm_set1_ps(v[3]); + + auto res = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m[0], xx), _mm_mul_ps(m[1], yy)), + _mm_add_ps(_mm_mul_ps(m[2], zz), _mm_mul_ps(m[3], ww))); + _mm_storeu_ps(dst, res); + } + + static void crossVec3(const float* v1, const float* v2, float* dst) + { + __m128 a = _mm_set_ps(0.0f, v1[2], v1[1], v1[0]); + __m128 b = _mm_set_ps(0.0f, v2[2], v2[1], v2[0]); + + __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 res = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); + + res = _mm_shuffle_ps(res, res, _MM_SHUFFLE(3, 0, 2, 1)); + + _mm_storel_pi((__m64*)dst, res); +# if defined(__SSE4_1__) + *reinterpret_cast(dst + 2) = _mm_extract_ps(res, 2); +# else + dst[2] = _mm_cvtss_f32(_mm_movehl_ps(res, res)); +# endif + } + + static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform) + { + auto& m = transform.col; + + for (size_t i = 0; i < count; ++i) + { + auto& vert = src[i].vertices; + __m128 v = _mm_set_ps(1.0f, vert.z, vert.y, vert.x); + v = _mm_add_ps( + _mm_add_ps(_mm_mul_ps(m[0], _mm_shuffle_ps(v, v, 0)), _mm_mul_ps(m[1], _mm_shuffle_ps(v, v, 0x55))), + _mm_add_ps(_mm_mul_ps(m[2], _mm_shuffle_ps(v, v, 0xaa)), _mm_mul_ps(m[3], _mm_shuffle_ps(v, v, 0xff)))); + _mm_storeu_ps((float*)&dst[i].vertices, v); + + // Copy tex coords and colors + // dst[i].texCoords = src[i].texCoords; + // dst[i].colors = src[i].colors; + memcpy(&dst[i].colors, &src[i].colors, sizeof(V3F_C4B_T2F::colors) + sizeof(V3F_C4B_T2F::texCoords)); + } + } + + static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) + { + __m128i offset_vector = _mm_set1_epi16(offset); + size_t remainder = count % 8; + size_t rounded_count = count - remainder; + + for (size_t i = 0; i < rounded_count; i += 8) + { + __m128i current_values = _mm_loadu_si128((__m128i*)(src + i)); // Load 8 values. + current_values = _mm_add_epi16(current_values, offset_vector); // Add offset to them. + _mm_storeu_si128((__m128i*)(dst + i), current_values); // Store the result. + } + + // If count is not divisible by 8, add offset for the remainder elements one by one. + for (size_t i = 0; i < remainder; ++i) + { + dst[rounded_count + i] = src[rounded_count + i] + offset; + } + } +}; + +#endif NS_AX_MATH_END diff --git a/core/platform/PlatformConfig.h b/core/platform/PlatformConfig.h index 0e0bd15a29de..39da4778f91c 100644 --- a/core/platform/PlatformConfig.h +++ b/core/platform/PlatformConfig.h @@ -163,5 +163,26 @@ Linux: Desktop GL/Vulkan # endif #endif +// ## SIMD detections +#if !defined(AX_NEON_INTRINSICS) +# if (AX_TARGET_PLATFORM != AX_PLATFORM_WASM) +# if defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM) || defined(__ARM_NEON__) +# define AX_NEON_INTRINSICS 1 +# endif +# endif +#endif + +#ifdef AX_SSE_INTRINSICS +// axmol math ISA require SSE2 at latest +# include +# if defined(__SSE4_1__) +# include +# endif +using _xm128_t = __m128; +#elif defined(AX_NEON_INTRINSICS) +# include +using _xm128_t = float32x4_t; +#endif + /// @endcond #endif // __BASE_AX_PLATFORM_CONFIG_H__ diff --git a/core/platform/PlatformMacros.h b/core/platform/PlatformMacros.h index 401193803702..98f3bde1c915 100644 --- a/core/platform/PlatformMacros.h +++ b/core/platform/PlatformMacros.h @@ -89,12 +89,12 @@ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md). * @since v0.99.5 */ #if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID) -# if !defined(AX_ENABLE_CACHE_TEXTURE_DATA) -# define AX_ENABLE_CACHE_TEXTURE_DATA 1 -# endif +# if !defined(AX_ENABLE_CACHE_TEXTURE_DATA) +# define AX_ENABLE_CACHE_TEXTURE_DATA 1 +# endif #else -# undef AX_ENABLE_CACHE_TEXTURE_DATA -# define AX_ENABLE_CACHE_TEXTURE_DATA 0 +# undef AX_ENABLE_CACHE_TEXTURE_DATA +# define AX_ENABLE_CACHE_TEXTURE_DATA 0 #endif /** @def AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST @@ -102,12 +102,12 @@ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md). * */ #if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID && !AX_ENABLE_CACHE_TEXTURE_DATA) -# if !defined(AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST) -# define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 1 -# endif +# if !defined(AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST) +# define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 1 +# endif #else -# undef AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST -# define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 0 +# undef AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST +# define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 0 #endif #if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID) || (AX_TARGET_PLATFORM == AX_PLATFORM_WIN32) @@ -188,14 +188,20 @@ protected: \ varType varName; \ \ public: \ - virtual inline varType get##funName() const { return varName; } + virtual inline varType get##funName() const \ + { \ + return varName; \ + } #define AX_SYNTHESIZE_READONLY_PASS_BY_REF(varType, varName, funName) \ protected: \ varType varName; \ \ public: \ - virtual inline const varType& get##funName() const { return varName; } + virtual inline const varType& get##funName() const \ + { \ + return varName; \ + } /** @def AX_SYNTHESIZE * It is used to declare a protected variable. @@ -209,36 +215,51 @@ public: \ * The variables and methods declared after AX_SYNTHESIZE are all public. * If you need protected or private, please declare. */ -#define AX_SYNTHESIZE(varType, varName, funName) \ -protected: \ - varType varName; \ - \ -public: \ - virtual inline varType get##funName() const { return varName; } \ - virtual inline void set##funName(varType var) { varName = var; } - -#define AX_SYNTHESIZE_PASS_BY_REF(varType, varName, funName) \ -protected: \ - varType varName; \ - \ -public: \ - virtual inline const varType& get##funName() const { return varName; } \ - virtual inline void set##funName(const varType& var) { varName = var; } - -#define AX_SYNTHESIZE_RETAIN(varType, varName, funName) \ -private: \ - varType varName; \ - \ -public: \ - virtual inline varType get##funName() const { return varName; } \ - virtual inline void set##funName(varType var) \ - { \ - if (varName != var) \ - { \ - AX_SAFE_RETAIN(var); \ - AX_SAFE_RELEASE(varName); \ - varName = var; \ - } \ +#define AX_SYNTHESIZE(varType, varName, funName) \ +protected: \ + varType varName; \ + \ +public: \ + virtual inline varType get##funName() const \ + { \ + return varName; \ + } \ + virtual inline void set##funName(varType var) \ + { \ + varName = var; \ + } + +#define AX_SYNTHESIZE_PASS_BY_REF(varType, varName, funName) \ +protected: \ + varType varName; \ + \ +public: \ + virtual inline const varType& get##funName() const \ + { \ + return varName; \ + } \ + virtual inline void set##funName(const varType& var) \ + { \ + varName = var; \ + } + +#define AX_SYNTHESIZE_RETAIN(varType, varName, funName) \ +private: \ + varType varName; \ + \ +public: \ + virtual inline varType get##funName() const \ + { \ + return varName; \ + } \ + virtual inline void set##funName(varType var) \ + { \ + if (varName != var) \ + { \ + AX_SAFE_RETAIN(var); \ + AX_SAFE_RELEASE(varName); \ + varName = var; \ + } \ } #define AX_SAFE_DELETE(p) \ @@ -252,7 +273,7 @@ public: \ { \ if (p) \ { \ - delete[](p); \ + delete[] (p); \ (p) = nullptr; \ } \ } while (0) @@ -318,7 +339,7 @@ public: \ } while (0) #elif _AX_DEBUG == 1 -# define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__) +# define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__) # define AXLOGERROR(format, ...) ax::print(format, ##__VA_ARGS__) # define AXLOGINFO(format, ...) \ do \ @@ -327,10 +348,10 @@ public: \ # define AXLOGWARN(...) __AXLOGWITHFUNCTION(__VA_ARGS__) #elif _AX_DEBUG > 1 -# define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__) +# define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__) # define AXLOGERROR(format, ...) ax::print(format, ##__VA_ARGS__) -# define AXLOGINFO(format, ...) ax::print(format, ##__VA_ARGS__) -# define AXLOGWARN(...) __AXLOGWITHFUNCTION(__VA_ARGS__) +# define AXLOGINFO(format, ...) ax::print(format, ##__VA_ARGS__) +# define AXLOGWARN(...) __AXLOGWITHFUNCTION(__VA_ARGS__) #endif // _AX_DEBUG /** Lua engine debug */ @@ -349,8 +370,8 @@ public: \ */ #if defined(__GNUC__) && ((__GNUC__ >= 5) || ((__GNUG__ == 4) && (__GNUC_MINOR__ >= 4))) || \ (defined(__clang__) && (__clang_major__ >= 3)) || (_MSC_VER >= 1800) -# define AX_DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&) = delete; \ +# define AX_DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ TypeName& operator=(const TypeName&) = delete; #else # define AX_DISALLOW_COPY_AND_ASSIGN(TypeName) \ @@ -444,15 +465,25 @@ public: \ */ #if __has_builtin(__builtin_expect) # ifdef __cplusplus -# define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), true)) +# define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), true)) # define UTILS_UNLIKELY(exp) (__builtin_expect(!!(exp), false)) # else -# define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), 1)) +# define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), 1)) # define UTILS_UNLIKELY(exp) (__builtin_expect(!!(exp), 0)) # endif #else -# define UTILS_LIKELY(exp) (!!(exp)) +# define UTILS_LIKELY(exp) (!!(exp)) # define UTILS_UNLIKELY(exp) (!!(exp)) #endif +#if defined(_MSC_VER) +// MSVC does not support loop unrolling hints +# define UTILS_UNROLL +# define UTILS_NOUNROLL +#else +// C++11 allows pragmas to be specified as part of defines using the _Pragma syntax. +# define UTILS_UNROLL _Pragma("unroll") +# define UTILS_NOUNROLL _Pragma("nounroll") +#endif + #endif // __AX_PLATFORM_MACROS_H__ diff --git a/tests/unit-tests/Source/core/math/MathUtilTests.cpp b/tests/unit-tests/Source/core/math/MathUtilTests.cpp index 4c5e8523b41a..b6d52fa2555c 100644 --- a/tests/unit-tests/Source/core/math/MathUtilTests.cpp +++ b/tests/unit-tests/Source/core/math/MathUtilTests.cpp @@ -26,57 +26,33 @@ #include #include "base/Config.h" #include "base/Types.h" +#include "math/MathBase.h" #include "TestUtils.h" -#if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS) - #if defined(__arm64__) - #define USE_NEON64 1 - #define INCLUDE_NEON64 1 - #elif defined(__ARM_NEON__) - #define USE_NEON32 1 - #define INCLUDE_NEON32 1 - #endif -#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX) - #if defined(__arm64__) || defined(__aarch64__) - #define USE_NEON64 1 - #define INCLUDE_NEON64 1 - #endif -#elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID) - #if defined(__arm64__) || defined(__aarch64__) - #define USE_NEON64 1 - #define INCLUDE_NEON64 1 - #elif defined(__ARM_NEON__) - #define INCLUDE_NEON32 1 - #endif -#endif +#define INCLUDE_SSE +#define USE_SSE -#if defined(USE_NEON32) || defined(USE_NEON64) // || defined(USE_SSE) - #define SKIP_SIMD_TEST doctest::skip(false) +#if defined(AX_SSE_INTRINSICS) || defined(AX_NEON_INTRINSICS) +# define SKIP_SIMD_TEST doctest::skip(false) #else - #define SKIP_SIMD_TEST doctest::skip(true) +# define SKIP_SIMD_TEST doctest::skip(true) #endif USING_NS_AX; -namespace UnitTest { - -#ifdef INCLUDE_NEON32 - #include "math/MathUtilNeon.inl" -#endif - -#ifdef INCLUDE_NEON64 - #include "math/MathUtilNeon64.inl" -#endif +namespace UnitTest +{ -#ifdef INCLUDE_SSE - // #include "math/MathUtilSSE.inl" +#ifdef AX_NEON_INTRINSICS +# include "math/MathUtilNeon.inl" +#elif defined(AX_SSE_INTRINSICS) +# include "math/MathUtilSSE.inl" #endif #include "math/MathUtil.inl" } // namespace UnitTest - static void __checkMathUtilResult(std::string_view description, const float* a1, const float* a2, int size) { // Check whether the result of the optimized instruction is the same as which is implemented in C @@ -87,11 +63,10 @@ static void __checkMathUtilResult(std::string_view description, const float* a1, } } - -TEST_SUITE("math/MathUtil") { +TEST_SUITE("math/MathUtil") +{ using namespace UnitTest::ax; - static void checkVerticesAreEqual(const V3F_C4B_T2F* v1, const V3F_C4B_T2F* v2, size_t count) { for (size_t i = 0; i < count; ++i) @@ -102,84 +77,94 @@ TEST_SUITE("math/MathUtil") { } } - - TEST_CASE("transformVertices") { + TEST_CASE("transformVertices") + { auto count = 5; std::vector src(count); std::vector expected(count); std::vector dst(count); - for (int i = 0; i < count; ++i) { + for (int i = 0; i < count; ++i) + { src[i].vertices.set(float(i), float(i + 1), float(i + 2)); src[i].colors.set(uint8_t(i + 3), uint8_t(i + 4), uint8_t(i + 5), uint8_t(i + 6)); src[i].texCoords.set(float(i + 7), float(i + 8)); - expected[i] = src[i]; + expected[i] = src[i]; expected[i].vertices.x = src[i].vertices.y * 4; expected[i].vertices.y = src[i].vertices.x * -5; expected[i].vertices.z = src[i].vertices.z * 6; } - Mat4 transform( - 0, 4, 0, 0, - -5, 0, 0, 0, - 0, 0, 6, 0, - 1, 2, 3, 1 - ); + Mat4 transform(0, 4, 0, 0, -5, 0, 0, 0, 0, 0, 6, 0, 1, 2, 3, 1); - SUBCASE("MathUtilC") { + SUBCASE("MathUtilC") + { MathUtilC::transformVertices(dst.data(), src.data(), count, transform); checkVerticesAreEqual(expected.data(), dst.data(), count); } - #if INCLUDE_NEON32 - SUBCASE("MathUtilNeon") { - MathUtilNeon::transformVertices(dst.data(), src.data(), count, transform); - checkVerticesAreEqual(expected.data(), dst.data(), count); - } - #endif - - #if INCLUDE_NEON64 - SUBCASE("MathUtilNeon64") { - MathUtilNeon64::transformVertices(dst.data(), src.data(), count, transform); - checkVerticesAreEqual(expected.data(), dst.data(), count); - } - #endif +#ifdef AX_NEON_INTRINSICS + SUBCASE("MathUtilNeon") + { + MathUtilNeon::transformVertices(dst.data(), src.data(), count, transform); + checkVerticesAreEqual(expected.data(), dst.data(), count); + } +#elif defined(AX_SSE_INTRINSICS) + SUBCASE("MathUtilSSE") + { + MathUtilSSE::transformVertices(dst.data(), src.data(), count, transform); + checkVerticesAreEqual(expected.data(), dst.data(), count); + } +#endif } - TEST_CASE("transformIndices") { + TEST_CASE("transformIndices") + { auto count = 43; std::vector src(count); std::vector expected(count); - for (int i = 0; i < count; ++i) { - src[i] = i; + for (int i = 0; i < count; ++i) + { + src[i] = i; expected[i] = i + 5; } uint16_t offset = 5; - SUBCASE("MathUtilC") { + SUBCASE("MathUtilC") + { std::vector dst(count); MathUtilC::transformIndices(dst.data(), src.data(), count, offset); for (int i = 0; i < count; ++i) CHECK_EQ(expected[i], dst[i]); } - #if INCLUDE_NEON64 - SUBCASE("MathUtilNeon64") { - std::vector dst(count); - MathUtilNeon64::transformIndices(dst.data(), src.data(), count, offset); - for (int i = 0; i < count; ++i) - CHECK_EQ(expected[i], dst[i]); - } - #endif +#if defined(AX_NEON_INTRINSICS) && AX_64BITS + SUBCASE("MathUtilNeon") + { + std::vector dst(count); + MathUtilNeon::transformIndices(dst.data(), src.data(), count, offset); + for (int i = 0; i < count; ++i) + CHECK_EQ(expected[i], dst[i]); + } +#elif defined(AX_SSE_INTRINSICS) + SUBCASE("MathUtilSSE") + { + std::vector dst(count); + MathUtilSSE::transformIndices(dst.data(), src.data(), count, offset); + for (int i = 0; i < count; ++i) + CHECK_EQ(expected[i], dst[i]); + } +#endif } } - -TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { - TEST_CASE("old_tests") { +TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) +{ + TEST_CASE("old_tests") + { // I know the next line looks ugly, but it's a way to test MathUtil. :) using namespace UnitTest::ax; @@ -213,20 +198,18 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void addMatrix(const float* m, float scalar, float* dst); MathUtilC::addMatrix(inMat41, scalar, outMat4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::addMatrix(inMat41, scalar, outMat4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::addMatrix(inMat41, scalar, outMat4Opt); - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::addMatrix(reinterpret_cast(inMat41), scalar, + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#endif - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_SSE_INTRINSICS + MathUtilSSE::addMatrix(reinterpret_cast(inMat41), scalar, + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#endif __checkMathUtilResult("inline static void addMatrix(const float* m, float scalar, float* dst);", outMat4C, - outMat4Opt, MAT4_SIZE); + outMat4Opt, MAT4_SIZE); // Clean memset(outMat4C, 0, sizeof(outMat4C)); memset(outMat4Opt, 0, sizeof(outMat4Opt)); @@ -234,20 +217,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void addMatrix(const float* m1, const float* m2, float* dst); MathUtilC::addMatrix(inMat41, inMat42, outMat4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::addMatrix(inMat41, inMat42, outMat4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::addMatrix(inMat41, inMat42, outMat4Opt); - #endif - - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::addMatrix(reinterpret_cast(inMat41), reinterpret_cast(inMat42), + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#elif defined(AX_SSE_INTRINSICS) + MathUtilSSE::addMatrix(reinterpret_cast(inMat41), reinterpret_cast(inMat42), + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#endif __checkMathUtilResult("inline static void addMatrix(const float* m1, const float* m2, float* dst);", outMat4C, - outMat4Opt, MAT4_SIZE); + outMat4Opt, MAT4_SIZE); // Clean memset(outMat4C, 0, sizeof(outMat4C)); memset(outMat4Opt, 0, sizeof(outMat4Opt)); @@ -255,20 +234,18 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void subtractMatrix(const float* m1, const float* m2, float* dst); MathUtilC::subtractMatrix(inMat41, inMat42, outMat4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::subtractMatrix(inMat41, inMat42, outMat4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::subtractMatrix(inMat41, inMat42, outMat4Opt); - #endif - - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::subtractMatrix(reinterpret_cast(inMat41), + reinterpret_cast(inMat42), + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#elif defined(AX_SSE_INTRINSICS) + MathUtilSSE::subtractMatrix(reinterpret_cast(inMat41), + reinterpret_cast(inMat42), + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#endif - __checkMathUtilResult("inline static void subtractMatrix(const float* m1, const float* m2, float* dst);", outMat4C, - outMat4Opt, MAT4_SIZE); + __checkMathUtilResult("inline static void subtractMatrix(const float* m1, const float* m2, float* dst);", + outMat4C, outMat4Opt, MAT4_SIZE); // Clean memset(outMat4C, 0, sizeof(outMat4C)); memset(outMat4Opt, 0, sizeof(outMat4Opt)); @@ -276,20 +253,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void multiplyMatrix(const float* m, float scalar, float* dst); MathUtilC::multiplyMatrix(inMat41, scalar, outMat4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::multiplyMatrix(inMat41, scalar, outMat4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::multiplyMatrix(inMat41, scalar, outMat4Opt); - #endif - - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::multiplyMatrix(reinterpret_cast(inMat41), scalar, + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#elif defined(AX_SSE_INTRINSICS) + MathUtilSSE::multiplyMatrix(reinterpret_cast(inMat41), scalar, + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#endif __checkMathUtilResult("inline static void multiplyMatrix(const float* m, float scalar, float* dst);", outMat4C, - outMat4Opt, MAT4_SIZE); + outMat4Opt, MAT4_SIZE); // Clean memset(outMat4C, 0, sizeof(outMat4C)); memset(outMat4Opt, 0, sizeof(outMat4Opt)); @@ -297,20 +270,18 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void multiplyMatrix(const float* m1, const float* m2, float* dst); MathUtilC::multiplyMatrix(inMat41, inMat42, outMat4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::multiplyMatrix(inMat41, inMat42, outMat4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::multiplyMatrix(inMat41, inMat42, outMat4Opt); - #endif - - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::multiplyMatrix(reinterpret_cast(inMat41), + reinterpret_cast(inMat42), + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#elif defined(AX_SSE_INTRINSICS) + MathUtilSSE::multiplyMatrix(reinterpret_cast(inMat41), + reinterpret_cast(inMat42), + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#endif - __checkMathUtilResult("inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);", outMat4C, - outMat4Opt, MAT4_SIZE); + __checkMathUtilResult("inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);", + outMat4C, outMat4Opt, MAT4_SIZE); // Clean memset(outMat4C, 0, sizeof(outMat4C)); memset(outMat4Opt, 0, sizeof(outMat4Opt)); @@ -318,20 +289,14 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void negateMatrix(const float* m, float* dst); MathUtilC::negateMatrix(inMat41, outMat4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::negateMatrix(inMat41, outMat4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::negateMatrix(inMat41, outMat4Opt); - #endif - - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::negateMatrix(reinterpret_cast(inMat41), reinterpret_cast<_xm128_t*>(outMat4Opt)); +#elif defined(AX_SSE_INTRINSICS) + MathUtilSSE::negateMatrix(reinterpret_cast(inMat41), reinterpret_cast<_xm128_t*>(outMat4Opt)); +#endif __checkMathUtilResult("inline static void negateMatrix(const float* m, float* dst);", outMat4C, outMat4Opt, - MAT4_SIZE); + MAT4_SIZE); // Clean memset(outMat4C, 0, sizeof(outMat4C)); memset(outMat4Opt, 0, sizeof(outMat4Opt)); @@ -339,20 +304,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void transposeMatrix(const float* m, float* dst); MathUtilC::transposeMatrix(inMat41, outMat4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::transposeMatrix(inMat41, outMat4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::transposeMatrix(inMat41, outMat4Opt); - #endif - - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::transposeMatrix(reinterpret_cast(inMat41), + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#elif defined(AX_SSE_INTRINSICS) + MathUtilSSE::transposeMatrix(reinterpret_cast(inMat41), + reinterpret_cast<_xm128_t*>(outMat4Opt)); +#endif __checkMathUtilResult("inline static void transposeMatrix(const float* m, float* dst);", outMat4C, outMat4Opt, - MAT4_SIZE); + MAT4_SIZE); // Clean memset(outMat4C, 0, sizeof(outMat4C)); memset(outMat4Opt, 0, sizeof(outMat4Opt)); @@ -360,21 +321,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst); MathUtilC::transformVec4(inMat41, x, y, z, w, outVec4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::transformVec4(inMat41, x, y, z, w, outVec4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::transformVec4(inMat41, x, y, z, w, outVec4Opt); - #endif - - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::transformVec4(reinterpret_cast(inMat41), x, y, z, w, outVec4Opt); +#elif defined(AX_SSE_INTRINSICS) + // FIXME: + MathUtilSSE::transformVec4(reinterpret_cast(inMat41), x, y, z, w, outVec4Opt); +#endif __checkMathUtilResult( - "inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);", outVec4C, - outVec4Opt, VEC4_SIZE); + "inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);", + outVec4C, outVec4Opt, VEC4_SIZE); // Clean memset(outVec4C, 0, sizeof(outVec4C)); memset(outVec4Opt, 0, sizeof(outVec4Opt)); @@ -382,20 +338,15 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void transformVec4(const float* m, const float* v, float* dst); MathUtilC::transformVec4(inMat41, inVec4, outVec4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::transformVec4(inMat41, inVec4, outVec4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::transformVec4(inMat41, inVec4, outVec4Opt); - #endif - - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::transformVec4(reinterpret_cast(inMat41), reinterpret_cast(inVec4), + reinterpret_cast(outVec4Opt)); +#elif defined(AX_SSE_INTRINSICS) + MathUtilSSE::transformVec4(reinterpret_cast(inMat41), reinterpret_cast(inVec4), reinterpret_cast(outVec4Opt)); +#endif __checkMathUtilResult("inline static void transformVec4(const float* m, const float* v, float* dst);", outVec4C, - outVec4Opt, VEC4_SIZE); + outVec4Opt, VEC4_SIZE); // Clean memset(outVec4C, 0, sizeof(outVec4C)); memset(outVec4Opt, 0, sizeof(outVec4Opt)); @@ -403,20 +354,14 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) { // inline static void crossVec3(const float* v1, const float* v2, float* dst); MathUtilC::crossVec3(inVec4, inVec42, outVec4C); - #ifdef INCLUDE_NEON32 - MathUtilNeon::crossVec3(inVec4, inVec42, outVec4Opt); - #endif - - #ifdef INCLUDE_NEON64 - MathUtilNeon64::crossVec3(inVec4, inVec42, outVec4Opt); - #endif - - #ifdef INCLUDE_SSE - // FIXME: - #endif +#ifdef AX_NEON_INTRINSICS + MathUtilNeon::crossVec3(inVec4, inVec42, outVec4Opt); +#elif defined(AX_SSE_INTRINSICS) + MathUtilSSE::crossVec3(inVec4, inVec42, outVec4Opt); +#endif __checkMathUtilResult("inline static void crossVec3(const float* v1, const float* v2, float* dst);", outVec4C, - outVec4Opt, VEC4_SIZE); + outVec4Opt, VEC4_SIZE); // Clean memset(outVec4C, 0, sizeof(outVec4C)); memset(outVec4Opt, 0, sizeof(outVec4Opt));