From 8293ce0f5f87a15583a132189c1e55c2512e801e Mon Sep 17 00:00:00 2001
From: halx99 <halx99@live.com>
Date: Sat, 3 Aug 2024 20:18:13 +0800
Subject: [PATCH] Refactor math simd

---
 1k/1kiss.ps1                                  |   1 +
 1k/fetch.cmake                                |   8 +-
 3rdparty/README.md                            |   2 +-
 3rdparty/yasio/yasio/bindings/yasio_ni.cpp    |  24 +-
 3rdparty/yasio/yasio/config.hpp               |   2 +-
 .../yasio/impl/eventfd_select_interrupter.hpp |  10 +-
 3rdparty/yasio/yasio/logging.hpp              |   3 +
 3rdparty/yasio/yasio/xxsocket.cpp             |   2 +-
 CMakeOptions.md                               |   1 +
 cmake/Modules/AXConfigDefine.cmake            |  15 +-
 core/CMakeLists.txt                           |  40 +-
 core/base/Configuration.cpp                   |  14 +-
 core/base/Console.cpp                         |   2 +-
 core/math/Mat4.cpp                            |  41 +-
 core/math/Mat4.h                              |  12 +-
 core/math/Mat4.inl                            |   3 +-
 core/math/MathBase.h                          |  40 +-
 core/math/MathUtil.cpp                        | 232 +++---
 core/math/MathUtil.h                          |  30 +-
 core/math/MathUtil.inl                        | 382 +++++-----
 core/math/MathUtilNeon.inl                    | 684 +++++++++---------
 core/math/MathUtilNeon64.inl                  | 398 ----------
 core/math/MathUtilSSE.inl                     | 413 +++++++----
 core/platform/PlatformConfig.h                |  21 +
 core/platform/PlatformMacros.h                | 135 ++--
 .../Source/core/math/MathUtilTests.cpp        | 345 ++++-----
 26 files changed, 1281 insertions(+), 1579 deletions(-)
 delete mode 100644 core/math/MathUtilNeon64.inl

diff --git a/1k/1kiss.ps1 b/1k/1kiss.ps1
index 1e9009412b4b..8d98e9f5ebd3 100644
--- a/1k/1kiss.ps1
+++ b/1k/1kiss.ps1
@@ -895,6 +895,7 @@ function setup_cmake($skipOS = $false, $scope = 'local') {
             else {
                 & "$cmake_pkg_path" '--skip-license' '--prefix=/usr/local' 1>$null 2>$null
             }
+            if (!$?) { Remove-Item $cmake_pkg_path -Force }
         }
 
         $cmake_prog, $_ = find_prog -name 'cmake' -path $cmake_bin -silent $true
diff --git a/1k/fetch.cmake b/1k/fetch.cmake
index 4556042af33e..6fa0c0804fde 100644
--- a/1k/fetch.cmake
+++ b/1k/fetch.cmake
@@ -20,10 +20,16 @@ function(_1kfetch_init)
         set(_1kfetch_manifest "${_1kfetch_manifest}" CACHE STRING "" FORCE)
     endif()
 
+    if(NOT EXISTS ${PWSH_PROG}) # try again
+        unset(PWSH_PROG CACHE)
+        find_program(PWSH_PROG NAMES pwsh powershell NO_PACKAGE_ROOT_PATH NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH NO_CMAKE_SYSTEM_PATH NO_CMAKE_FIND_ROOT_PATH)
+    endif()
+
     execute_process(COMMAND ${PWSH_PROG} ${CMAKE_CURRENT_FUNCTION_LIST_DIR}/resolv-uri.ps1
         -name "1kdist"
         -manifest ${_1kfetch_manifest}
         OUTPUT_VARIABLE _1kdist_url
+        RESULT_VARIABLE _1kdist_error
     )
 
     if(_1kdist_url)
@@ -33,7 +39,7 @@ function(_1kfetch_init)
         set(_1kdist_base_url "${_1kdist_base_url}/${_1kdist_ver}" PARENT_SCOPE)
         set(_1kdist_ver ${_1kdist_ver} PARENT_SCOPE)
     else()
-        message(WARNING "Resolve 1kdist uri fail, the _1kfetch_dist will not work")
+        message(WARNING "Resolve 1kdist uri fail, ${_1kdist_error}, the _1kfetch_dist will not work")
     endif()
 endfunction()
 
diff --git a/3rdparty/README.md b/3rdparty/README.md
index 7cdfd34e53d8..bb9df631800b 100644
--- a/3rdparty/README.md
+++ b/3rdparty/README.md
@@ -248,7 +248,7 @@
 
 ## yasio
 - [![Upstream](https://img.shields.io/github/v/release/yasio/yasio?label=Upstream)](https://github.com/yasio/yasio)
-- Version: 4.2.3
+- Version: 4.2.4
 - License: MIT WITH Anti-996
 
 ## zlib
diff --git a/3rdparty/yasio/yasio/bindings/yasio_ni.cpp b/3rdparty/yasio/yasio/bindings/yasio_ni.cpp
index 8914488f6b06..50b1bab7239d 100644
--- a/3rdparty/yasio/yasio/bindings/yasio_ni.cpp
+++ b/3rdparty/yasio/yasio/bindings/yasio_ni.cpp
@@ -60,14 +60,14 @@ YASIO_NI_API void yasio_init_globals(void(YASIO_INTEROP_DECL* pfn)(int level, co
 YASIO_NI_API void yasio_cleanup_globals() { io_service::cleanup_globals(); }
 
 struct yasio_io_event {
-  int kind; //
-  int channel;
-  void* thandle;
+  int kind; // event kind
+  int channel; // channel index
+  void* thandle; // transport
   union {
-    void* msg;
-    int status; //
+    void* hmsg; // io_packet*
+    int ec; // error code
   };
-  void* user;
+  void* user; // user data
 };
 
 YASIO_NI_API void* yasio_create_service(int channel_count, void(YASIO_INTEROP_DECL* event_cb)(yasio_io_event* event), void* user)
@@ -82,9 +82,9 @@ YASIO_NI_API void* yasio_create_service(int channel_count, void(YASIO_INTEROP_DE
     event.thandle = e->transport();
     event.user    = user;
     if (event.kind == yasio::YEK_ON_PACKET)
-      event.msg = !is_packet_empty(pkt) ? &pkt : nullptr;
+      event.hmsg = !is_packet_empty(pkt) ? &pkt : nullptr;
     else
-      event.status = e->status();
+      event.ec = e->status();
     event_cb(&event);
   });
   return service;
@@ -157,8 +157,12 @@ YASIO_NI_API void yasio_set_option(void* service_ptr, int opt, const char* pszAr
   std::array<cxx17::string_view, YASIO_MAX_OPTION_ARGC> args;
   int argc = 0;
   yasio::split_if(&strArgs.front(), ';', [&](char* s, char* e) {
-    *e           = '\0'; // to c style string
-    args[argc++] = cxx17::string_view(s, e - s);
+    if (e) {
+        *e           = '\0'; // to c style string
+        args[argc++] = cxx17::string_view(s, e - s);
+    } else {
+        args[argc++] = cxx17::string_view{s};
+    }
     return (argc < YASIO_MAX_OPTION_ARGC);
   });
 
diff --git a/3rdparty/yasio/yasio/config.hpp b/3rdparty/yasio/yasio/config.hpp
index 77c35c1301bf..6ae79af5b2ec 100644
--- a/3rdparty/yasio/yasio/config.hpp
+++ b/3rdparty/yasio/yasio/config.hpp
@@ -205,7 +205,7 @@ SOFTWARE.
 /*
 **  The yasio version macros
 */
-#define YASIO_VERSION_NUM 0x040203
+#define YASIO_VERSION_NUM 0x040204
 
 /*
 ** The macros used by io_service.
diff --git a/3rdparty/yasio/yasio/impl/eventfd_select_interrupter.hpp b/3rdparty/yasio/yasio/impl/eventfd_select_interrupter.hpp
index eb5fe285a68a..55d32953da48 100644
--- a/3rdparty/yasio/yasio/impl/eventfd_select_interrupter.hpp
+++ b/3rdparty/yasio/yasio/impl/eventfd_select_interrupter.hpp
@@ -20,11 +20,11 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <fcntl.h>
-#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8)
-#  include <asm/unistd.h>
-#else // __GLIBC__ == 2 && __GLIBC_MINOR__ < 8
+#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8) && !defined(__UCLIBC__)
+#  include <asm/unistd.h> // for syscall without API: eventfd
+#else
 #  include <sys/eventfd.h>
-#endif // __GLIBC__ == 2 && __GLIBC_MINOR__ < 8
+#endif
 
 #include <unistd.h>
 
@@ -105,7 +105,7 @@ class eventfd_select_interrupter {
   // Open the descriptors. Throws on error.
   inline void open_descriptors()
   {
-#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8)
+#if defined(__GLIBC__) && (__GLIBC__ == 2 && __GLIBC_MINOR__ < 8) && !defined(__UCLIBC__)
     write_descriptor_ = read_descriptor_ = syscall(__NR_eventfd, 0);
     if (read_descriptor_ != -1)
     {
diff --git a/3rdparty/yasio/yasio/logging.hpp b/3rdparty/yasio/yasio/logging.hpp
index 12b79b94f0d3..11f28046195c 100644
--- a/3rdparty/yasio/yasio/logging.hpp
+++ b/3rdparty/yasio/yasio/logging.hpp
@@ -40,6 +40,9 @@ inline void yasio__print(std::string&& message) { ::write(::fileno(stdout), mess
 #  include <android/log.h>
 #  include <jni.h>
 #  define YASIO_LOG_TAG(tag, format, ...) __android_log_print(ANDROID_LOG_INFO, "yasio", (tag format), ##__VA_ARGS__)
+#elif defined(__OHOS__)
+#  include <hilog/log.h>
+#  define YASIO_LOG_TAG(tag, format, ...) OH_LOG_INFO(LOG_APP, (tag format "\n"), ##__VA_ARGS__)
 #else
 #  define YASIO_LOG_TAG(tag, format, ...) printf((tag format "\n"), ##__VA_ARGS__)
 #endif
diff --git a/3rdparty/yasio/yasio/xxsocket.cpp b/3rdparty/yasio/yasio/xxsocket.cpp
index 947757846ad6..6c1e06fb6122 100644
--- a/3rdparty/yasio/yasio/xxsocket.cpp
+++ b/3rdparty/yasio/yasio/xxsocket.cpp
@@ -209,7 +209,7 @@ int xxsocket::pserve(const endpoint& ep)
   if (!this->reopen(ep.af()))
     return -1;
 
-  set_optval(SOL_SOCKET, SO_REUSEADDR, 1);
+  this->reuse_address(true);
 
   int n = this->bind(ep);
   if (n != 0)
diff --git a/CMakeOptions.md b/CMakeOptions.md
index 484c69a2f19c..89135c7d4a94 100644
--- a/CMakeOptions.md
+++ b/CMakeOptions.md
@@ -52,6 +52,7 @@ default is: `navigator.hardwareConcurrency`
 - AX_WASM_SHELL_FILE: specify the wasm shell file, by default use `${_AX_ROOT}/core/platform/wasm/shell_minimal.html`
 - AX_WASM_ENABLE_DEVTOOLS: whether enable web devtools aka `pause`, `resume`, `step` buttons in webpage, default: `TRUE`
 - AX_WASM_INITIAL_MEMORY: set the wasm initial memory size, default `1024MB`
+- AX_WASM_ISA_SIMD: specify the wasm SIMD intrinsics type, default `none`; supports `sse` and `neon`. Note that `wasm-simd` is not supported by axmol yet
 
 ## The options for axmol apps
 
diff --git a/cmake/Modules/AXConfigDefine.cmake b/cmake/Modules/AXConfigDefine.cmake
index e2156b202e53..6e3e569e8913 100644
--- a/cmake/Modules/AXConfigDefine.cmake
+++ b/cmake/Modules/AXConfigDefine.cmake
@@ -186,22 +186,21 @@ endfunction()
 
 if(EMSCRIPTEN)
     set(AX_WASM_THREADS "4" CACHE STRING "Wasm threads count")
-
-    set(_AX_WASM_THREADS_INT 0)
+    set(_threads_hint "")
     if (AX_WASM_THREADS STREQUAL "auto") # not empty string or not 0
         # Enable pthread support globally
+        set(_threads_hint "(auto)")
         include(ProcessorCount)
+        set(_AX_WASM_THREADS_INT 0)
         ProcessorCount(_AX_WASM_THREADS_INT)
-    elseif(AX_WASM_THREADS MATCHES "^([0-9]+)$" OR AX_WASM_THREADS STREQUAL "navigator.hardwareConcurrency")
-        set(_AX_WASM_THREADS_INT ${AX_WASM_THREADS})
+        set(AX_WASM_THREADS "${_AX_WASM_THREADS_INT}" CACHE STRING "Wasm threads count" FORCE)
     endif()
 
-    message(STATUS "AX_WASM_THREADS=${AX_WASM_THREADS}")
-    message(STATUS "_AX_WASM_THREADS_INT=${_AX_WASM_THREADS_INT}")
+    message(STATUS "AX_WASM_THREADS=${AX_WASM_THREADS}${_threads_hint}")
 
-    if (_AX_WASM_THREADS_INT)
+    if(AX_WASM_THREADS MATCHES "^([0-9]+)$" OR AX_WASM_THREADS STREQUAL "navigator.hardwareConcurrency")
         list(APPEND _ax_compile_options -pthread)
-        add_link_options(-pthread -sPTHREAD_POOL_SIZE=${_AX_WASM_THREADS_INT})
+        add_link_options(-pthread -sPTHREAD_POOL_SIZE=${AX_WASM_THREADS})
     endif()
 
     set(AX_WASM_INITIAL_MEMORY "1024MB" CACHE STRING "")
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 96ef16b5f77f..7defad5ae35c 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -400,9 +400,43 @@ if(WINDOWS)
     endif()
 endif()
 
-# AX_USE_SSE
-if (AX_ISA_SIMD MATCHES "sse")
-    target_compile_definitions(${_AX_CORE_LIB} PUBLIC AX_USE_SSE=1)
+# axmol math simd intrinsics support
+set(_simdc_defines)
+set(_simdc_options)
+if (NOT WASM) # native platforms auto detect from cmake or preprocessor check
+    if (AX_ISA_SIMD MATCHES "sse")
+        list(APPEND _simdc_defines AX_SSE_INTRINSICS=1)
+        if (AX_ISA_SIMD MATCHES "sse4")
+            list(APPEND _simdc_defines __SSE4_1__=1)
+            if (LINUX)
+                list(APPEND _simdc_options -msse4.1)
+            endif()
+        endif()
+    endif()
+else() # wasm requires user specify SIMD intrinsics manually
+    set(AX_WASM_ISA_SIMD "none" CACHE STRING "")
+    string(TOLOWER ${AX_WASM_ISA_SIMD} AX_WASM_ISA_SIMD)
+    if(AX_WASM_ISA_SIMD MATCHES "sse")
+        message(AUTHOR_WARNING "Using SSE intrinsics for WASM ...")
+        list(APPEND _simdc_defines AX_SSE_INTRINSICS=1 __SSE__=1 __SSE2__=1)
+        list(APPEND _simdc_options -msse -msse2)
+        if(AX_ISA_LEVEL GREATER_EQUAL 2)
+            list(APPEND _simdc_defines __SSE4_1__=1)
+            list(APPEND _simdc_options -msse4.1)
+        endif()
+        list(APPEND _simdc_options -msimd128)
+    elseif(AX_WASM_ISA_SIMD MATCHES "neon")
+        message(AUTHOR_WARNING "Using NEON intrinsics for WASM ...")
+        list(APPEND _simdc_defines AX_NEON_INTRINSICS=1)
+        list(APPEND _simdc_options -mfpu=neon -msimd128)
+    endif()
+endif()
+
+if(_simdc_defines)
+    target_compile_definitions(${_AX_CORE_LIB} PUBLIC ${_simdc_defines})
+    if(_simdc_options)
+        target_compile_options(${_AX_CORE_LIB} PUBLIC ${_simdc_options})
+    endif()
 endif()
 
 # engine extensions
diff --git a/core/base/Configuration.cpp b/core/base/Configuration.cpp
index aa442d4ff543..8fe50d291f9f 100644
--- a/core/base/Configuration.cpp
+++ b/core/base/Configuration.cpp
@@ -71,7 +71,7 @@ bool Configuration::init()
 #if AX_ENABLE_PROFILERS
     _valueDict["axmol.compiled_with_profiler"] = Value(true);
 #else
-    _valueDict["axmol.compiled_with_profiler"]       = Value(false);
+    _valueDict["axmol.compiled_with_profiler"] = Value(false);
 #endif
 
 #if AX_ENABLE_GL_STATE_CACHE == 0
@@ -83,7 +83,17 @@ bool Configuration::init()
 #if _AX_DEBUG
     _valueDict["axmol.build_type"] = Value("DEBUG");
 #else
-    _valueDict["axmol.build_type"]                   = Value("RELEASE");
+    _valueDict["axmol.build_type"] = Value("RELEASE");
+#endif
+
+#if defined(AX_SSE_INTRINSICS)
+#    if defined(__SSE4_1__)
+    _valueDict["axmol.simd"] = Value("SSE41");
+#    else
+    _valueDict["axmol.simd"] = Value("SSE2");
+#    endif
+#elif defined(AX_NEON_INTRINSICS)
+    _valueDict["axmol.simd"] = Value("NEON");
 #endif
 
     return true;
diff --git a/core/base/Console.cpp b/core/base/Console.cpp
index b5fc8ef2d0c2..a9569d8697dd 100644
--- a/core/base/Console.cpp
+++ b/core/base/Console.cpp
@@ -398,7 +398,7 @@ bool Console::listenOnTCP(int port)
     if (sock.pserve(ep) != 0)
     {
         int ec = xxsocket::get_last_errno();
-        AXLOGW("Console: open server failed, ec:{}", ec);
+        AXLOGW("Console: open server failed, ec:{}, {}", ec, xxsocket::strerror(ec));
         return false;
     }
 
diff --git a/core/math/Mat4.cpp b/core/math/Mat4.cpp
index d08383871037..6ab50adc4ec4 100644
--- a/core/math/Mat4.cpp
+++ b/core/math/Mat4.cpp
@@ -17,7 +17,7 @@
 
  Original file from GamePlay3D: http://gameplay3d.org
 
- This file was modified to fit the cocos2d-x project
+ This file was modified to fit the axmol project
  */
 
 #include "math/Mat4.h"
@@ -459,11 +459,7 @@ void Mat4::add(float scalar)
 void Mat4::add(float scalar, Mat4* dst)
 {
     GP_ASSERT(dst);
-#ifdef AX_USE_SSE
-    MathUtil::addMatrix(col, scalar, dst->col);
-#else
     MathUtil::addMatrix(m, scalar, dst->m);
-#endif
 }
 
 void Mat4::add(const Mat4& mat)
@@ -474,11 +470,7 @@ void Mat4::add(const Mat4& mat)
 void Mat4::add(const Mat4& m1, const Mat4& m2, Mat4* dst)
 {
     GP_ASSERT(dst);
-#ifdef AX_USE_SSE
-    MathUtil::addMatrix(m1.col, m2.col, dst->col);
-#else
     MathUtil::addMatrix(m1.m, m2.m, dst->m);
-#endif
 }
 
 bool Mat4::decompose(Vec3* scale, Quaternion* rotation, Vec3* translation) const
@@ -751,11 +743,7 @@ void Mat4::multiply(float scalar, Mat4* dst) const
 void Mat4::multiply(const Mat4& m, float scalar, Mat4* dst)
 {
     GP_ASSERT(dst);
-#ifdef AX_USE_SSE
-    MathUtil::multiplyMatrix(m.col, scalar, dst->col);
-#else
     MathUtil::multiplyMatrix(m.m, scalar, dst->m);
-#endif
 }
 
 void Mat4::multiply(const Mat4& mat)
@@ -766,20 +754,12 @@ void Mat4::multiply(const Mat4& mat)
 void Mat4::multiply(const Mat4& m1, const Mat4& m2, Mat4* dst)
 {
     GP_ASSERT(dst);
-#ifdef AX_USE_SSE
-    MathUtil::multiplyMatrix(m1.col, m2.col, dst->col);
-#else
     MathUtil::multiplyMatrix(m1.m, m2.m, dst->m);
-#endif
 }
 
 void Mat4::negate()
 {
-#ifdef AX_USE_SSE
-    MathUtil::negateMatrix(col, col);
-#else
     MathUtil::negateMatrix(m, m);
-#endif
 }
 
 Mat4 Mat4::getNegated() const
@@ -945,11 +925,7 @@ void Mat4::subtract(const Mat4& mat)
 void Mat4::subtract(const Mat4& m1, const Mat4& m2, Mat4* dst)
 {
     GP_ASSERT(dst);
-#ifdef AX_USE_SSE
-    MathUtil::subtractMatrix(m1.col, m2.col, dst->col);
-#else
     MathUtil::subtractMatrix(m1.m, m2.m, dst->m);
-#endif
 }
 
 void Mat4::transformVector(Vec3* vector) const
@@ -967,7 +943,7 @@ void Mat4::transformVector(float x, float y, float z, float w, Vec3* dst) const
 {
     GP_ASSERT(dst);
 
-    MathUtil::transformVec4(m, x, y, z, w, (float*)dst);
+    MathUtil::transformVec4(m, x, y, z, w, reinterpret_cast<float*>(dst));
 }
 
 void Mat4::transformVector(Vec4* vector) const
@@ -979,14 +955,7 @@ void Mat4::transformVector(Vec4* vector) const
 void Mat4::transformVector(const Vec4& vector, Vec4* dst) const
 {
     GP_ASSERT(dst);
-#ifdef AX_USE_SSE
-    alignas(16) Vec4 inVal{vector};
-    alignas(16) Vec4 outVal;
-    MathUtil::transformVec4(col, reinterpret_cast<const __m128&>(inVal), reinterpret_cast<__m128&>(outVal));
-    *dst = outVal;
-#else
-    MathUtil::transformVec4(m, (const float*)&vector, (float*)dst);
-#endif
+    MathUtil::transformVec4(m, reinterpret_cast<const float*>(&vector), reinterpret_cast<float*>(dst));
 }
 
 void Mat4::translate(float x, float y, float z)
@@ -1013,11 +982,7 @@ void Mat4::translate(const Vec3& t, Mat4* dst) const
 
 void Mat4::transpose()
 {
-#ifdef AX_USE_SSE
-    MathUtil::transposeMatrix(col, col);
-#else
     MathUtil::transposeMatrix(m, m);
-#endif
 }
 
 Mat4 Mat4::getTransposed() const
diff --git a/core/math/Mat4.h b/core/math/Mat4.h
index e6cd4b6f3757..613648901699 100644
--- a/core/math/Mat4.h
+++ b/core/math/Mat4.h
@@ -18,7 +18,7 @@
 
  Original file from GamePlay3D: http://gameplay3d.org
 
- This file was modified to fit the cocos2d-x project
+ This file was modified to fit the axmol project
  */
 
 #ifndef MATH_MAT4_H
@@ -29,10 +29,6 @@
 #include "math/Vec3.h"
 #include "math/Vec4.h"
 
-#ifdef AX_USE_SSE
-#    include <xmmintrin.h>
-#endif
-
 /**
  * @addtogroup base
  * @{
@@ -73,7 +69,7 @@ NS_AX_MATH_BEGIN
  *
  * @see Transform
  */
-#ifdef AX_USE_SSE
+#if defined(AX_SSE_INTRINSICS) || defined(AX_NEON_INTRINSICS)
 class AX_DLL alignas(16) Mat4
 #else
 class AX_DLL Mat4
@@ -95,10 +91,10 @@ class AX_DLL Mat4
     /**
      * Stores the columns of this 4x4 matrix.
      * */
-#ifdef AX_USE_SSE
+#if defined(AX_SSE_INTRINSICS) || defined(AX_NEON_INTRINSICS)
     union
     {
-        __m128 col[4];
+        _xm128_t col[4];
         float m[16];
     };
 #else
diff --git a/core/math/Mat4.inl b/core/math/Mat4.inl
index 69dac19e5fc8..1babde69f1f5 100644
--- a/core/math/Mat4.inl
+++ b/core/math/Mat4.inl
@@ -1,5 +1,6 @@
 /**
  Copyright 2013 BlackBerry Inc.
+ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -15,7 +16,7 @@
 
  Original file from GamePlay3D: http://gameplay3d.org
 
- This file was modified to fit the cocos2d-x project
+ This file was modified to fit the axmol project
  */
 
 #include "math/Mat4.h"
diff --git a/core/math/MathBase.h b/core/math/MathBase.h
index 16d0a62e75ea..72d1e581fbbd 100644
--- a/core/math/MathBase.h
+++ b/core/math/MathBase.h
@@ -1,5 +1,6 @@
 /****************************************************************************
  Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
+ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
 
  https://axmol.dev/
 
@@ -22,46 +23,47 @@
  THE SOFTWARE.
  ****************************************************************************/
 
-#ifndef __CCMATHBASE_H__
-#define __CCMATHBASE_H__
+#ifndef __AXMATHBASE_H__
+#define __AXMATHBASE_H__
 
 #include <memory>
 #include <string.h>
 #include "platform/PlatformMacros.h"
+
 /**
  * @addtogroup base
  * @{
  */
 
 /**Util macro for conversion from degrees to radians.*/
-#define MATH_DEG_TO_RAD(x) ((x)*0.0174532925f)
+#define MATH_DEG_TO_RAD(x) ((x) * 0.0174532925f)
 /**Util macro for conversion from radians to degrees.*/
-#define MATH_RAD_TO_DEG(x) ((x)*57.29577951f)
+#define MATH_RAD_TO_DEG(x) ((x) * 57.29577951f)
 /**
 @{ Util macro for const float such as epsilon, small float and float precision tolerance.
 */
 #define MATH_FLOAT_SMALL 1.0e-37f
-#define MATH_TOLERANCE 2e-37f
-#define MATH_PIOVER2 1.57079632679489661923f
-#define MATH_EPSILON 0.000001f
+#define MATH_TOLERANCE   2e-37f
+#define MATH_PIOVER2     1.57079632679489661923f
+#define MATH_EPSILON     0.000001f
 /**@}*/
 
-//#define MATH_PIOVER4                0.785398163397448309616f
-//#define MATH_PIX2                   6.28318530717958647693f
-//#define MATH_E                      2.71828182845904523536f
-//#define MATH_LOG10E                 0.4342944819032518f
-//#define MATH_LOG2E                  1.442695040888963387f
-//#define MATH_PI                     3.14159265358979323846f
-//#define MATH_RANDOM_MINUS1_1()      ((2.0f*((float)rand()/RAND_MAX))-1.0f)      // Returns a random float between -1
-// and 1. #define MATH_RANDOM_0_1()           ((float)rand()/RAND_MAX)                    // Returns a random float
-// between 0 and 1. #define MATH_CLAMP(x, lo, hi)       ((x < lo) ? lo : ((x > hi) ? hi : x)) #ifndef M_1_PI #define
-// M_1_PI                      0.31830988618379067154
+// #define MATH_PIOVER4                0.785398163397448309616f
+// #define MATH_PIX2                   6.28318530717958647693f
+// #define MATH_E                      2.71828182845904523536f
+// #define MATH_LOG10E                 0.4342944819032518f
+// #define MATH_LOG2E                  1.442695040888963387f
+// #define MATH_PI                     3.14159265358979323846f
+// #define MATH_RANDOM_MINUS1_1()      ((2.0f*((float)rand()/RAND_MAX))-1.0f)  // Returns a random float between -1 and 1.
+// #define MATH_RANDOM_0_1()           ((float)rand()/RAND_MAX)                // Returns a random float between 0 and 1.
+// #define MATH_CLAMP(x, lo, hi)       ((x < lo) ? lo : ((x > hi) ? hi : x))
+// #ifndef M_1_PI #define M_1_PI 0.31830988618379067154
 
 #ifdef __cplusplus
 #    define NS_AX_MATH_BEGIN \
-        namespace ax    \
+        namespace ax         \
         {
-#    define NS_AX_MATH_END }
+#    define NS_AX_MATH_END   }
 #    define USING_NS_AX_MATH using namespace ax
 #else
 #    define NS_AX_MATH_BEGIN
diff --git a/core/math/MathUtil.cpp b/core/math/MathUtil.cpp
index 9fb49000b3b2..805c8dfb331b 100644
--- a/core/math/MathUtil.cpp
+++ b/core/math/MathUtil.cpp
@@ -17,7 +17,7 @@ limitations under the License.
 
 Original file from GamePlay3D: http://gameplay3d.org
 
-This file was modified to fit the cocos2d-x project
+This file was modified to fit the axmol project
 */
 
 #include "math/MathUtil.h"
@@ -28,50 +28,10 @@ This file was modified to fit the cocos2d-x project
 #    include <cpu-features.h>
 #endif
 
-//#define USE_NEON32        : neon 32 code will be used
-//#define USE_NEON64        : neon 64 code will be used
-//#define INCLUDE_NEON32    : neon 32 code included
-//#define INCLUDE_NEON64    : neon 64 code included
-//#define USE_SSE           : SSE code used
-//#define INCLUDE_SSE       : SSE code included
-
-#if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS)
-#    if defined(__arm64__)
-#        define USE_NEON64 1
-#        define INCLUDE_NEON64 1
-#    elif defined(__ARM_NEON__)
-#        define USE_NEON32 1
-#        define INCLUDE_NEON32 1
-#    endif
-#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX)
-#    if defined(__arm64__) || defined(__aarch64__)
-#        define USE_NEON64 1
-#        define INCLUDE_NEON64 1
-#    endif
-#elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
-#    if defined(__arm64__) || defined(__aarch64__)
-#        define USE_NEON64 1
-#        define INCLUDE_NEON64 1
-#    elif defined(__ARM_NEON__)
-#        define INCLUDE_NEON32 1
-#    endif
-#endif
-
-#if defined(AX_USE_SSE)
-#    define USE_SSE 1
-#    define INCLUDE_SSE 1
-#endif
-
-#ifdef INCLUDE_NEON32
-#    include "math/MathUtilNeon.inl"
-#endif
-
-#ifdef INCLUDE_NEON64
-#    include "math/MathUtilNeon64.inl"
-#endif
-
-#ifdef INCLUDE_SSE
+#if defined(AX_SSE_INTRINSICS)
 #    include "math/MathUtilSSE.inl"
+#elif defined(AX_NEON_INTRINSICS)
+#    include "math/MathUtilNeon.inl"
 #endif
 
 #include "math/MathUtil.inl"
@@ -106,9 +66,8 @@ float MathUtil::lerp(float from, float to, float alpha)
 
 bool MathUtil::isNeon32Enabled()
 {
-#ifdef USE_NEON32
-    return true;
-#elif (defined(INCLUDE_NEON32) && (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID))
+#if defined(AX_NEON_INTRINSICS) && !AX_64BITS
+#    if AX_NEON_INTRINSICS == 1 && AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID
     class AnrdoidNeonChecker
     {
     public:
@@ -127,15 +86,9 @@ bool MathUtil::isNeon32Enabled()
     };
     static AnrdoidNeonChecker checker;
     return checker.isNeonEnabled();
-#else
-    return false;
-#endif
-}
-
-bool MathUtil::isNeon64Enabled()
-{
-#ifdef USE_NEON64
+#    else
     return true;
+#    endif
 #else
     return false;
 #endif
@@ -143,15 +96,17 @@ bool MathUtil::isNeon64Enabled()
 
 void MathUtil::addMatrix(const float* m, float scalar, float* dst)
 {
-#ifdef USE_NEON32
-    MathUtilNeon::addMatrix(m, scalar, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::addMatrix(m, scalar, dst);
-#elif defined(INCLUDE_NEON32)
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::addMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
+    MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
+#    else
     if (isNeon32Enabled())
-        MathUtilNeon::addMatrix(m, scalar, dst);
+        MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
     else
         MathUtilC::addMatrix(m, scalar, dst);
+#    endif
 #else
     MathUtilC::addMatrix(m, scalar, dst);
 #endif
@@ -159,15 +114,20 @@ void MathUtil::addMatrix(const float* m, float scalar, float* dst)
 
 void MathUtil::addMatrix(const float* m1, const float* m2, float* dst)
 {
-#ifdef USE_NEON32
-    MathUtilNeon::addMatrix(m1, m2, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::addMatrix(m1, m2, dst);
-#elif defined(INCLUDE_NEON32)
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::addMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
+                           reinterpret_cast<_xm128_t*>(dst));
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
+    MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
+                            reinterpret_cast<_xm128_t*>(dst));
+#    else
     if (isNeon32Enabled())
-        MathUtilNeon::addMatrix(m1, m2, dst);
+        MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
+                                reinterpret_cast<_xm128_t*>(dst));
     else
         MathUtilC::addMatrix(m1, m2, dst);
+#    endif
 #else
     MathUtilC::addMatrix(m1, m2, dst);
 #endif
@@ -175,15 +135,20 @@ void MathUtil::addMatrix(const float* m1, const float* m2, float* dst)
 
 void MathUtil::subtractMatrix(const float* m1, const float* m2, float* dst)
 {
-#ifdef USE_NEON32
-    MathUtilNeon::subtractMatrix(m1, m2, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::subtractMatrix(m1, m2, dst);
-#elif defined(INCLUDE_NEON32)
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::subtractMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
+                                reinterpret_cast<_xm128_t*>(dst));
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
+    MathUtilNeon::subtractMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
+                                 reinterpret_cast<_xm128_t*>(dst));
+#    else
     if (isNeon32Enabled())
-        MathUtilNeon::subtractMatrix(m1, m2, dst);
+        MathUtilNeon::subtractMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
+                                     reinterpret_cast<_xm128_t*>(dst));
     else
         MathUtilC::subtractMatrix(m1, m2, dst);
+#    endif
 #else
     MathUtilC::subtractMatrix(m1, m2, dst);
 #endif
@@ -191,15 +156,17 @@ void MathUtil::subtractMatrix(const float* m1, const float* m2, float* dst)
 
 void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst)
 {
-#ifdef USE_NEON32
-    MathUtilNeon::multiplyMatrix(m, scalar, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::multiplyMatrix(m, scalar, dst);
-#elif defined(INCLUDE_NEON32)
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
+    MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
+#    else
     if (isNeon32Enabled())
-        MathUtilNeon::multiplyMatrix(m, scalar, dst);
+        MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m), scalar, reinterpret_cast<_xm128_t*>(dst));
     else
         MathUtilC::multiplyMatrix(m, scalar, dst);
+#    endif
 #else
     MathUtilC::multiplyMatrix(m, scalar, dst);
 #endif
@@ -207,15 +174,20 @@ void MathUtil::multiplyMatrix(const float* m, float scalar, float* dst)
 
 void MathUtil::multiplyMatrix(const float* m1, const float* m2, float* dst)
 {
-#ifdef USE_NEON32
-    MathUtilNeon::multiplyMatrix(m1, m2, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::multiplyMatrix(m1, m2, dst);
-#elif defined(INCLUDE_NEON32)
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
+                                reinterpret_cast<_xm128_t*>(dst));
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
+    MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
+                                 reinterpret_cast<_xm128_t*>(dst));
+#    else
     if (isNeon32Enabled())
-        MathUtilNeon::multiplyMatrix(m1, m2, dst);
+        MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(m1), reinterpret_cast<const _xm128_t*>(m2),
+                                     reinterpret_cast<_xm128_t*>(dst));
     else
         MathUtilC::multiplyMatrix(m1, m2, dst);
+#    endif
 #else
     MathUtilC::multiplyMatrix(m1, m2, dst);
 #endif
@@ -223,15 +195,17 @@ void MathUtil::multiplyMatrix(const float* m1, const float* m2, float* dst)
 
 void MathUtil::negateMatrix(const float* m, float* dst)
 {
-#ifdef USE_NEON32
-    MathUtilNeon::negateMatrix(m, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::negateMatrix(m, dst);
-#elif defined(INCLUDE_NEON32)
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::negateMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
+    MathUtilNeon::negateMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
+#    else
     if (isNeon32Enabled())
-        MathUtilNeon::negateMatrix(m, dst);
+        MathUtilNeon::negateMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
     else
         MathUtilC::negateMatrix(m, dst);
+#    endif
 #else
     MathUtilC::negateMatrix(m, dst);
 #endif
@@ -239,47 +213,53 @@ void MathUtil::negateMatrix(const float* m, float* dst)
 
 void MathUtil::transposeMatrix(const float* m, float* dst)
 {
-#ifdef USE_NEON32
-    MathUtilNeon::transposeMatrix(m, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::transposeMatrix(m, dst);
-#elif defined(INCLUDE_NEON32)
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::transposeMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
+    MathUtilNeon::transposeMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
+#    else
     if (isNeon32Enabled())
-        MathUtilNeon::transposeMatrix(m, dst);
+        MathUtilNeon::transposeMatrix(reinterpret_cast<const _xm128_t*>(m), reinterpret_cast<_xm128_t*>(dst));
     else
         MathUtilC::transposeMatrix(m, dst);
+#    endif
 #else
     MathUtilC::transposeMatrix(m, dst);
 #endif
 }
 
-void MathUtil::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
+void MathUtil::transformVec4(const float* m, float x, float y, float z, float w, float* dst /*vec3*/)
 {
-#ifdef USE_NEON32
-    MathUtilNeon::transformVec4(m, x, y, z, w, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::transformVec4(m, x, y, z, w, dst);
-#elif defined(INCLUDE_NEON32)
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::transformVec4(reinterpret_cast<const _xm128_t*>(m), x, y, z, w, dst);
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
+    MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(m), x, y, z, w, dst);
+#    else
     if (isNeon32Enabled())
-        MathUtilNeon::transformVec4(m, x, y, z, w, dst);
+        MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(m), x, y, z, w, dst);
     else
         MathUtilC::transformVec4(m, x, y, z, w, dst);
+#    endif
 #else
     MathUtilC::transformVec4(m, x, y, z, w, dst);
 #endif
 }
 
-void MathUtil::transformVec4(const float* m, const float* v, float* dst)
+void MathUtil::transformVec4(const float* m, const float* v, float* dst /*vec4*/)
 {
-#ifdef USE_NEON32
-    MathUtilNeon::transformVec4(m, v, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::transformVec4(m, v, dst);
-#elif defined(INCLUDE_NEON32)
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::transformVec4(reinterpret_cast<const _xm128_t*>(m), v, dst);
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
+    MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(m), v, dst);
+#    else
     if (isNeon32Enabled())
-        MathUtilNeon::transformVec4(m, v, dst);
+        MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(m), v, dst);
     else
         MathUtilC::transformVec4(m, v, dst);
+#    endif
 #else
     MathUtilC::transformVec4(m, v, dst);
 #endif
@@ -287,15 +267,17 @@ void MathUtil::transformVec4(const float* m, const float* v, float* dst)
 
 void MathUtil::crossVec3(const float* v1, const float* v2, float* dst)
 {
-#ifdef USE_NEON32
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::crossVec3(v1, v2, dst);
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
     MathUtilNeon::crossVec3(v1, v2, dst);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::crossVec3(v1, v2, dst);
-#elif defined(INCLUDE_NEON32)
+#    else
     if (isNeon32Enabled())
         MathUtilNeon::crossVec3(v1, v2, dst);
     else
         MathUtilC::crossVec3(v1, v2, dst);
+#    endif
 #else
     MathUtilC::crossVec3(v1, v2, dst);
 #endif
@@ -308,24 +290,28 @@ void MathUtil::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_
     static_assert(offsetof(V3F_C4B_T2F, vertices) == 0);
     static_assert(offsetof(V3F_C4B_T2F, colors) == 12);
     static_assert(offsetof(V3F_C4B_T2F, texCoords) == 16);
-
-#ifdef USE_NEON32
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::transformVertices(dst, src, count, transform);
+#elif defined(AX_NEON_INTRINSICS)
+#    if AX_64BITS || AX_NEON_INTRINSICS > 1
     MathUtilNeon::transformVertices(dst, src, count, transform);
-#elif defined(USE_NEON64)
-    MathUtilNeon64::transformVertices(dst, src, count, transform);
-#elif defined(INCLUDE_NEON32)
+#    else
     if (isNeon32Enabled())
         MathUtilNeon::transformVertices(dst, src, count, transform);
     else
         MathUtilC::transformVertices(dst, src, count, transform);
+#    endif
 #else
     MathUtilC::transformVertices(dst, src, count, transform);
 #endif
 }
 
-void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset) {
-#if defined(USE_NEON64)
-    MathUtilNeon64::transformIndices(dst, src, count, offset);
+void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
+{
+#if defined(AX_SSE_INTRINSICS)
+    MathUtilSSE::transformIndices(dst, src, count, offset);
+#elif defined(AX_NEON_INTRINSICS) && AX_64BITS
+    MathUtilNeon::transformIndices(dst, src, count, offset);
 #else
     MathUtilC::transformIndices(dst, src, count, offset);
 #endif
diff --git a/core/math/MathUtil.h b/core/math/MathUtil.h
index 7cb78b7845f0..b7057fe8c7d3 100644
--- a/core/math/MathUtil.h
+++ b/core/math/MathUtil.h
@@ -18,16 +18,12 @@
 
  Original file from GamePlay3D: http://gameplay3d.org
 
- This file was modified to fit the cocos2d-x project
+ This file was modified to fit the axmol project
  */
 
 #ifndef MATHUTIL_H_
 #define MATHUTIL_H_
 
-#ifdef AX_USE_SSE
-#    include <xmmintrin.h>
-#endif
-
 #include "math/MathBase.h"
 
 
@@ -42,7 +38,7 @@ NS_AX_END
 
 NS_AX_MATH_BEGIN
 
-class Mat4;
+class Vec4;
 
 /**
  * Defines a math utility class.
@@ -100,26 +96,8 @@ class AX_DLL MathUtil
 private:
     // Indicates that if neon is enabled
     static bool isNeon32Enabled();
-    static bool isNeon64Enabled();
 
 private:
-#ifdef AX_USE_SSE
-    static void addMatrix(const __m128 m[4], float scalar, __m128 dst[4]);
-
-    static void addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
-
-    static void subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
-
-    static void multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4]);
-
-    static void multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4]);
-
-    static void negateMatrix(const __m128 m[4], __m128 dst[4]);
-
-    static void transposeMatrix(const __m128 m[4], __m128 dst[4]);
-
-    static void transformVec4(const __m128 m[4], const __m128& v, __m128& dst);
-#endif
     static void addMatrix(const float* m, float scalar, float* dst);
 
     static void addMatrix(const float* m1, const float* m2, float* dst);
@@ -134,9 +112,9 @@ class AX_DLL MathUtil
 
     static void transposeMatrix(const float* m, float* dst);
 
-    static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
+    static void transformVec4(const float* m, float x, float y, float z, float w, float* dst/*vec3*/);
 
-    static void transformVec4(const float* m, const float* v, float* dst);
+    static void transformVec4(const float* m, const float* v, float* dst/*vec4*/);
 
     static void crossVec3(const float* v1, const float* v2, float* dst);
 
diff --git a/core/math/MathUtil.inl b/core/math/MathUtil.inl
index 4d7028bdbd59..a2da119df439 100644
--- a/core/math/MathUtil.inl
+++ b/core/math/MathUtil.inl
@@ -16,7 +16,7 @@
 
  Original file from GamePlay3D: http://gameplay3d.org
 
- This file was modified to fit the cocos2d-x project
+ This file was modified to fit the axmol project
  */
 
 NS_AX_MATH_BEGIN
@@ -24,221 +24,201 @@ NS_AX_MATH_BEGIN
 class MathUtilC
 {
 public:
-    inline static void addMatrix(const float* m, float scalar, float* dst);
-    inline static void addMatrix(const float* m1, const float* m2, float* dst);
-    inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
-    inline static void multiplyMatrix(const float* m, float scalar, float* dst);
-    inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
-
-    inline static void negateMatrix(const float* m, float* dst);
-    inline static void transposeMatrix(const float* m, float* dst);
-
-    inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
-    inline static void transformVec4(const float* m, const float* v, float* dst);
-    inline static void crossVec3(const float* v1, const float* v2, float* dst);
-
-    inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
-    inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
-};
+    inline static void addMatrix(const float* m, float scalar, float* dst)
+    {
+        dst[0]  = m[0] + scalar;
+        dst[1]  = m[1] + scalar;
+        dst[2]  = m[2] + scalar;
+        dst[3]  = m[3] + scalar;
+        dst[4]  = m[4] + scalar;
+        dst[5]  = m[5] + scalar;
+        dst[6]  = m[6] + scalar;
+        dst[7]  = m[7] + scalar;
+        dst[8]  = m[8] + scalar;
+        dst[9]  = m[9] + scalar;
+        dst[10] = m[10] + scalar;
+        dst[11] = m[11] + scalar;
+        dst[12] = m[12] + scalar;
+        dst[13] = m[13] + scalar;
+        dst[14] = m[14] + scalar;
+        dst[15] = m[15] + scalar;
+    }
 
-inline void MathUtilC::addMatrix(const float* m, float scalar, float* dst)
-{
-    dst[0]  = m[0]  + scalar;
-    dst[1]  = m[1]  + scalar;
-    dst[2]  = m[2]  + scalar;
-    dst[3]  = m[3]  + scalar;
-    dst[4]  = m[4]  + scalar;
-    dst[5]  = m[5]  + scalar;
-    dst[6]  = m[6]  + scalar;
-    dst[7]  = m[7]  + scalar;
-    dst[8]  = m[8]  + scalar;
-    dst[9]  = m[9]  + scalar;
-    dst[10] = m[10] + scalar;
-    dst[11] = m[11] + scalar;
-    dst[12] = m[12] + scalar;
-    dst[13] = m[13] + scalar;
-    dst[14] = m[14] + scalar;
-    dst[15] = m[15] + scalar;
-}
-
-inline void MathUtilC::addMatrix(const float* m1, const float* m2, float* dst)
-{
-    dst[0]  = m1[0]  + m2[0];
-    dst[1]  = m1[1]  + m2[1];
-    dst[2]  = m1[2]  + m2[2];
-    dst[3]  = m1[3]  + m2[3];
-    dst[4]  = m1[4]  + m2[4];
-    dst[5]  = m1[5]  + m2[5];
-    dst[6]  = m1[6]  + m2[6];
-    dst[7]  = m1[7]  + m2[7];
-    dst[8]  = m1[8]  + m2[8];
-    dst[9]  = m1[9]  + m2[9];
-    dst[10] = m1[10] + m2[10];
-    dst[11] = m1[11] + m2[11];
-    dst[12] = m1[12] + m2[12];
-    dst[13] = m1[13] + m2[13];
-    dst[14] = m1[14] + m2[14];
-    dst[15] = m1[15] + m2[15];
-}
-
-inline void MathUtilC::subtractMatrix(const float* m1, const float* m2, float* dst)
-{
-    dst[0]  = m1[0]  - m2[0];
-    dst[1]  = m1[1]  - m2[1];
-    dst[2]  = m1[2]  - m2[2];
-    dst[3]  = m1[3]  - m2[3];
-    dst[4]  = m1[4]  - m2[4];
-    dst[5]  = m1[5]  - m2[5];
-    dst[6]  = m1[6]  - m2[6];
-    dst[7]  = m1[7]  - m2[7];
-    dst[8]  = m1[8]  - m2[8];
-    dst[9]  = m1[9]  - m2[9];
-    dst[10] = m1[10] - m2[10];
-    dst[11] = m1[11] - m2[11];
-    dst[12] = m1[12] - m2[12];
-    dst[13] = m1[13] - m2[13];
-    dst[14] = m1[14] - m2[14];
-    dst[15] = m1[15] - m2[15];
-}
-
-inline void MathUtilC::multiplyMatrix(const float* m, float scalar, float* dst)
-{
-    dst[0]  = m[0]  * scalar;
-    dst[1]  = m[1]  * scalar;
-    dst[2]  = m[2]  * scalar;
-    dst[3]  = m[3]  * scalar;
-    dst[4]  = m[4]  * scalar;
-    dst[5]  = m[5]  * scalar;
-    dst[6]  = m[6]  * scalar;
-    dst[7]  = m[7]  * scalar;
-    dst[8]  = m[8]  * scalar;
-    dst[9]  = m[9]  * scalar;
-    dst[10] = m[10] * scalar;
-    dst[11] = m[11] * scalar;
-    dst[12] = m[12] * scalar;
-    dst[13] = m[13] * scalar;
-    dst[14] = m[14] * scalar;
-    dst[15] = m[15] * scalar;
-}
-
-inline void MathUtilC::multiplyMatrix(const float* m1, const float* m2, float* dst)
-{
-    // Support the case where m1 or m2 is the same array as dst.
-    float product[16];
+    inline static void addMatrix(const float* m1, const float* m2, float* dst)
+    {
+        dst[0]  = m1[0] + m2[0];
+        dst[1]  = m1[1] + m2[1];
+        dst[2]  = m1[2] + m2[2];
+        dst[3]  = m1[3] + m2[3];
+        dst[4]  = m1[4] + m2[4];
+        dst[5]  = m1[5] + m2[5];
+        dst[6]  = m1[6] + m2[6];
+        dst[7]  = m1[7] + m2[7];
+        dst[8]  = m1[8] + m2[8];
+        dst[9]  = m1[9] + m2[9];
+        dst[10] = m1[10] + m2[10];
+        dst[11] = m1[11] + m2[11];
+        dst[12] = m1[12] + m2[12];
+        dst[13] = m1[13] + m2[13];
+        dst[14] = m1[14] + m2[14];
+        dst[15] = m1[15] + m2[15];
+    }
 
-    product[0]  = m1[0] * m2[0]  + m1[4] * m2[1] + m1[8]   * m2[2]  + m1[12] * m2[3];
-    product[1]  = m1[1] * m2[0]  + m1[5] * m2[1] + m1[9]   * m2[2]  + m1[13] * m2[3];
-    product[2]  = m1[2] * m2[0]  + m1[6] * m2[1] + m1[10]  * m2[2]  + m1[14] * m2[3];
-    product[3]  = m1[3] * m2[0]  + m1[7] * m2[1] + m1[11]  * m2[2]  + m1[15] * m2[3];
+    inline static void subtractMatrix(const float* m1, const float* m2, float* dst)
+    {
+        dst[0]  = m1[0] - m2[0];
+        dst[1]  = m1[1] - m2[1];
+        dst[2]  = m1[2] - m2[2];
+        dst[3]  = m1[3] - m2[3];
+        dst[4]  = m1[4] - m2[4];
+        dst[5]  = m1[5] - m2[5];
+        dst[6]  = m1[6] - m2[6];
+        dst[7]  = m1[7] - m2[7];
+        dst[8]  = m1[8] - m2[8];
+        dst[9]  = m1[9] - m2[9];
+        dst[10] = m1[10] - m2[10];
+        dst[11] = m1[11] - m2[11];
+        dst[12] = m1[12] - m2[12];
+        dst[13] = m1[13] - m2[13];
+        dst[14] = m1[14] - m2[14];
+        dst[15] = m1[15] - m2[15];
+    }
 
-    product[4]  = m1[0] * m2[4]  + m1[4] * m2[5] + m1[8]   * m2[6]  + m1[12] * m2[7];
-    product[5]  = m1[1] * m2[4]  + m1[5] * m2[5] + m1[9]   * m2[6]  + m1[13] * m2[7];
-    product[6]  = m1[2] * m2[4]  + m1[6] * m2[5] + m1[10]  * m2[6]  + m1[14] * m2[7];
-    product[7]  = m1[3] * m2[4]  + m1[7] * m2[5] + m1[11]  * m2[6]  + m1[15] * m2[7];
+    inline static void multiplyMatrix(const float* m, float scalar, float* dst)
+    {
+        dst[0]  = m[0] * scalar;
+        dst[1]  = m[1] * scalar;
+        dst[2]  = m[2] * scalar;
+        dst[3]  = m[3] * scalar;
+        dst[4]  = m[4] * scalar;
+        dst[5]  = m[5] * scalar;
+        dst[6]  = m[6] * scalar;
+        dst[7]  = m[7] * scalar;
+        dst[8]  = m[8] * scalar;
+        dst[9]  = m[9] * scalar;
+        dst[10] = m[10] * scalar;
+        dst[11] = m[11] * scalar;
+        dst[12] = m[12] * scalar;
+        dst[13] = m[13] * scalar;
+        dst[14] = m[14] * scalar;
+        dst[15] = m[15] * scalar;
+    }
 
-    product[8]  = m1[0] * m2[8]  + m1[4] * m2[9] + m1[8]   * m2[10] + m1[12] * m2[11];
-    product[9]  = m1[1] * m2[8]  + m1[5] * m2[9] + m1[9]   * m2[10] + m1[13] * m2[11];
-    product[10] = m1[2] * m2[8]  + m1[6] * m2[9] + m1[10]  * m2[10] + m1[14] * m2[11];
-    product[11] = m1[3] * m2[8]  + m1[7] * m2[9] + m1[11]  * m2[10] + m1[15] * m2[11];
+    inline static void multiplyMatrix(const float* m1, const float* m2, float* dst)
+    {
+        // Support the case where m1 or m2 is the same array as dst.
+        float product[16];
+
+        product[0] = m1[0] * m2[0] + m1[4] * m2[1] + m1[8] * m2[2] + m1[12] * m2[3];
+        product[1] = m1[1] * m2[0] + m1[5] * m2[1] + m1[9] * m2[2] + m1[13] * m2[3];
+        product[2] = m1[2] * m2[0] + m1[6] * m2[1] + m1[10] * m2[2] + m1[14] * m2[3];
+        product[3] = m1[3] * m2[0] + m1[7] * m2[1] + m1[11] * m2[2] + m1[15] * m2[3];
+
+        product[4] = m1[0] * m2[4] + m1[4] * m2[5] + m1[8] * m2[6] + m1[12] * m2[7];
+        product[5] = m1[1] * m2[4] + m1[5] * m2[5] + m1[9] * m2[6] + m1[13] * m2[7];
+        product[6] = m1[2] * m2[4] + m1[6] * m2[5] + m1[10] * m2[6] + m1[14] * m2[7];
+        product[7] = m1[3] * m2[4] + m1[7] * m2[5] + m1[11] * m2[6] + m1[15] * m2[7];
+
+        product[8]  = m1[0] * m2[8] + m1[4] * m2[9] + m1[8] * m2[10] + m1[12] * m2[11];
+        product[9]  = m1[1] * m2[8] + m1[5] * m2[9] + m1[9] * m2[10] + m1[13] * m2[11];
+        product[10] = m1[2] * m2[8] + m1[6] * m2[9] + m1[10] * m2[10] + m1[14] * m2[11];
+        product[11] = m1[3] * m2[8] + m1[7] * m2[9] + m1[11] * m2[10] + m1[15] * m2[11];
+
+        product[12] = m1[0] * m2[12] + m1[4] * m2[13] + m1[8] * m2[14] + m1[12] * m2[15];
+        product[13] = m1[1] * m2[12] + m1[5] * m2[13] + m1[9] * m2[14] + m1[13] * m2[15];
+        product[14] = m1[2] * m2[12] + m1[6] * m2[13] + m1[10] * m2[14] + m1[14] * m2[15];
+        product[15] = m1[3] * m2[12] + m1[7] * m2[13] + m1[11] * m2[14] + m1[15] * m2[15];
+
+        memcpy(dst, product, MATRIX_SIZE);
+    }
 
-    product[12] = m1[0] * m2[12] + m1[4] * m2[13] + m1[8]  * m2[14] + m1[12] * m2[15];
-    product[13] = m1[1] * m2[12] + m1[5] * m2[13] + m1[9]  * m2[14] + m1[13] * m2[15];
-    product[14] = m1[2] * m2[12] + m1[6] * m2[13] + m1[10] * m2[14] + m1[14] * m2[15];
-    product[15] = m1[3] * m2[12] + m1[7] * m2[13] + m1[11] * m2[14] + m1[15] * m2[15];
+    inline static void negateMatrix(const float* m, float* dst)
+    {
+        dst[0]  = -m[0];
+        dst[1]  = -m[1];
+        dst[2]  = -m[2];
+        dst[3]  = -m[3];
+        dst[4]  = -m[4];
+        dst[5]  = -m[5];
+        dst[6]  = -m[6];
+        dst[7]  = -m[7];
+        dst[8]  = -m[8];
+        dst[9]  = -m[9];
+        dst[10] = -m[10];
+        dst[11] = -m[11];
+        dst[12] = -m[12];
+        dst[13] = -m[13];
+        dst[14] = -m[14];
+        dst[15] = -m[15];
+    }
 
-    memcpy(dst, product, MATRIX_SIZE);
-}
+    inline static void transposeMatrix(const float* m, float* dst)
+    {
+        float t[16] = {m[0], m[4], m[8],  m[12], m[1], m[5], m[9],  m[13],
+                       m[2], m[6], m[10], m[14], m[3], m[7], m[11], m[15]};
+        memcpy(dst, t, MATRIX_SIZE);
+    }
 
-inline void MathUtilC::negateMatrix(const float* m, float* dst)
-{
-    dst[0]  = -m[0];
-    dst[1]  = -m[1];
-    dst[2]  = -m[2];
-    dst[3]  = -m[3];
-    dst[4]  = -m[4];
-    dst[5]  = -m[5];
-    dst[6]  = -m[6];
-    dst[7]  = -m[7];
-    dst[8]  = -m[8];
-    dst[9]  = -m[9];
-    dst[10] = -m[10];
-    dst[11] = -m[11];
-    dst[12] = -m[12];
-    dst[13] = -m[13];
-    dst[14] = -m[14];
-    dst[15] = -m[15];
-}
-
-inline void MathUtilC::transposeMatrix(const float* m, float* dst)
-{
-    float t[16] = {
-        m[0], m[4], m[8], m[12],
-        m[1], m[5], m[9], m[13],
-        m[2], m[6], m[10], m[14],
-        m[3], m[7], m[11], m[15]
-    };
-    memcpy(dst, t, MATRIX_SIZE);
-}
-
-inline void MathUtilC::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
-{
-    dst[0] = x * m[0] + y * m[4] + z * m[8] + w * m[12];
-    dst[1] = x * m[1] + y * m[5] + z * m[9] + w * m[13];
-    dst[2] = x * m[2] + y * m[6] + z * m[10] + w * m[14];
-}
+    inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst)
+    {
+        dst[0] = x * m[0] + y * m[4] + z * m[8] + w * m[12];
+        dst[1] = x * m[1] + y * m[5] + z * m[9] + w * m[13];
+        dst[2] = x * m[2] + y * m[6] + z * m[10] + w * m[14];
+    }
 
-inline void MathUtilC::transformVec4(const float* m, const float* v, float* dst)
-{
-    // Handle case where v == dst.
-    float x = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + v[3] * m[12];
-    float y = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + v[3] * m[13];
-    float z = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + v[3] * m[14];
-    float w = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + v[3] * m[15];
-
-    dst[0] = x;
-    dst[1] = y;
-    dst[2] = z;
-    dst[3] = w;
-}
-
-inline void MathUtilC::crossVec3(const float* v1, const float* v2, float* dst)
-{
-    float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
-    float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
-    float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);
+    inline static void transformVec4(const float* m, const float* v, float* dst)
+    {
+        // Handle case where v == dst.
+        float x = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + v[3] * m[12];
+        float y = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + v[3] * m[13];
+        float z = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + v[3] * m[14];
+        float w = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + v[3] * m[15];
+
+        dst[0] = x;
+        dst[1] = y;
+        dst[2] = z;
+        dst[3] = w;
+    }
 
-    dst[0] = x;
-    dst[1] = y;
-    dst[2] = z;
-}
+    inline static void crossVec3(const float* v1, const float* v2, float* dst)
+    {
+        float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
+        float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
+        float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);
 
-inline void MathUtilC::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
-{
-    auto end = dst + count;
-    auto t = transform; // Make copy for better aliasing inference
-    auto m = t.m;
+        dst[0] = x;
+        dst[1] = y;
+        dst[2] = z;
+    }
 
-    while (dst < end)
+    inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
     {
-        auto pos = src->vertices;
-        dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8]  + m[12];
-        dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9]  + m[13];
-        dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
-        memcpy(&dst->colors, &src->colors, sizeof(dst->colors) + sizeof(dst->texCoords));
-        ++dst;
-        ++src;
+        auto end = dst + count;
+        auto& t  = transform;  // Reference only, no copy is made (the pre-refactor code copied the matrix for aliasing inference)
+        auto m   = t.m;
+
+        while (dst < end)
+        {
+            auto pos        = src->vertices;
+            dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8] + m[12];
+            dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9] + m[13];
+            dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
+            memcpy(&dst->colors, &src->colors, sizeof(V3F_C4B_T2F::colors) + sizeof(V3F_C4B_T2F::texCoords));
+            ++dst;
+            ++src;
+        }
     }
-}
 
-inline void MathUtilC::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
-{
-    auto end = dst + count;
-    while (dst < end)
+    inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
     {
-        *dst = *src + offset;
-        ++dst;
-        ++src;
+        auto end = dst + count;
+        while (dst < end)
+        {
+            *dst = *src + offset;
+            ++dst;
+            ++src;
+        }
     }
-}
+};
 
 NS_AX_MATH_END
diff --git a/core/math/MathUtilNeon.inl b/core/math/MathUtilNeon.inl
index e80382490351..42773e51637c 100644
--- a/core/math/MathUtilNeon.inl
+++ b/core/math/MathUtilNeon.inl
@@ -16,356 +16,374 @@
 
  Original file from GamePlay3D: http://gameplay3d.org
 
- This file was modified to fit the cocos2d-x project
+ This file was modified to fit the axmol project
  */
 
 #include <arm_neon.h>
 
 NS_AX_MATH_BEGIN
 
-class MathUtilNeon
+struct MathUtilNeon
 {
-public:
-    inline static void addMatrix(const float* m, float scalar, float* dst);
-    inline static void addMatrix(const float* m1, const float* m2, float* dst);
-    inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
-    inline static void multiplyMatrix(const float* m, float scalar, float* dst);
-    inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
+#if defined(__EMSCRIPTEN__)
+#    define vmlaq_lane_f32(a, b, c, lane) vaddq_f32(a, vmulq_lane_f32(b, c, lane))
+#endif
 
-    inline static void negateMatrix(const float* m, float* dst);
-    inline static void transposeMatrix(const float* m, float* dst);
+    inline static void addMatrix(const _xm128_t* m, float scalar, _xm128_t* dst)
+    {
+        float32x4_t s = vdupq_n_f32(scalar);
+        dst[0]        = vaddq_f32(m[0], s);
+        dst[1]        = vaddq_f32(m[1], s);
+        dst[2]        = vaddq_f32(m[2], s);
+        dst[3]        = vaddq_f32(m[3], s);
+    }
 
-    inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
-    inline static void transformVec4(const float* m, const float* v, float* dst);
-    inline static void crossVec3(const float* v1, const float* v2, float* dst);
+    inline static void addMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst)
+    {
+        dst[0] = vaddq_f32(m1[0], m2[0]);
+        dst[1] = vaddq_f32(m1[1], m2[1]);
+        dst[2] = vaddq_f32(m1[2], m2[2]);
+        dst[3] = vaddq_f32(m1[3], m2[3]);
+    }
 
-    inline static void transformVertices(ax::V3F_C4B_T2F* dst, const ax::V3F_C4B_T2F* src, size_t count, const ax::Mat4& transform);
-};
+    inline static void subtractMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst)
+    {
+        dst[0] = vsubq_f32(m1[0], m2[0]);
+        dst[1] = vsubq_f32(m1[1], m2[1]);
+        dst[2] = vsubq_f32(m1[2], m2[2]);
+        dst[3] = vsubq_f32(m1[3], m2[3]);
+    }
 
-inline void MathUtilNeon::addMatrix(const float* m, float scalar, float* dst)
-{
-    asm volatile(
-                 "vld1.32 {q0, q1}, [%1]!    \n\t" // M[m0-m7]
-                 "vld1.32 {q2, q3}, [%1]     \n\t" // M[m8-m15]
-                 "vld1.32 {d8[0]},  [%2]     \n\t" // s
-                 "vmov.f32 s17, s16          \n\t" // s
-                 "vmov.f32 s18, s16          \n\t" // s
-                 "vmov.f32 s19, s16          \n\t" // s
-
-                 "vadd.f32 q8, q0, q4        \n\t" // DST->M[m0-m3] = M[m0-m3] + s
-                 "vadd.f32 q9, q1, q4        \n\t" // DST->M[m4-m7] = M[m4-m7] + s
-                 "vadd.f32 q10, q2, q4       \n\t" // DST->M[m8-m11] = M[m8-m11] + s
-                 "vadd.f32 q11, q3, q4       \n\t" // DST->M[m12-m15] = M[m12-m15] + s
-
-                 "vst1.32 {q8, q9}, [%0]!    \n\t" // DST->M[m0-m7]
-                 "vst1.32 {q10, q11}, [%0]   \n\t" // DST->M[m8-m15]
-                 :
-                 : "r"(dst), "r"(m), "r"(&scalar)
-                 : "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "memory"
-                 );
-}
-
-inline void MathUtilNeon::addMatrix(const float* m1, const float* m2, float* dst)
-{
-    asm volatile(
-                 "vld1.32     {q0, q1},     [%1]! \n\t" // M1[m0-m7]
-                 "vld1.32     {q2, q3},     [%1]  \n\t" // M1[m8-m15]
-                 "vld1.32     {q8, q9},     [%2]! \n\t" // M2[m0-m7]
-                 "vld1.32     {q10, q11}, [%2]    \n\t" // M2[m8-m15]
-
-                 "vadd.f32   q12, q0, q8          \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3]
-                 "vadd.f32   q13, q1, q9          \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7]
-                 "vadd.f32   q14, q2, q10         \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11]
-                 "vadd.f32   q15, q3, q11         \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15]
-
-                 "vst1.32    {q12, q13}, [%0]!    \n\t" // DST->M[m0-m7]
-                 "vst1.32    {q14, q15}, [%0]     \n\t" // DST->M[m8-m15]
-                 :
-                 : "r"(dst), "r"(m1), "r"(m2)
-                 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
-                 );
-}
-
-inline void MathUtilNeon::subtractMatrix(const float* m1, const float* m2, float* dst)
-{
-    asm volatile(
-                 "vld1.32     {q0, q1},     [%1]!  \n\t" // M1[m0-m7]
-                 "vld1.32     {q2, q3},     [%1]   \n\t" // M1[m8-m15]
-                 "vld1.32     {q8, q9},     [%2]!  \n\t" // M2[m0-m7]
-                 "vld1.32     {q10, q11}, [%2]     \n\t" // M2[m8-m15]
-
-                 "vsub.f32   q12, q0, q8         \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3]
-                 "vsub.f32   q13, q1, q9         \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7]
-                 "vsub.f32   q14, q2, q10        \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11]
-                 "vsub.f32   q15, q3, q11        \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15]
-
-                 "vst1.32    {q12, q13}, [%0]!   \n\t" // DST->M[m0-m7]
-                 "vst1.32    {q14, q15}, [%0]    \n\t" // DST->M[m8-m15]
-                 :
-                 : "r"(dst), "r"(m1), "r"(m2)
-                 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
-                 );
-}
-
-inline void MathUtilNeon::multiplyMatrix(const float* m, float scalar, float* dst)
-{
-    asm volatile(
-                 "vld1.32     {d0[0]},         [%2]        \n\t" // M[m0-m7]
-                 "vld1.32    {q4-q5},          [%1]!       \n\t" // M[m8-m15]
-                 "vld1.32    {q6-q7},          [%1]        \n\t" // s
-
-                 "vmul.f32     q8, q4, d0[0]               \n\t" // DST->M[m0-m3] = M[m0-m3] * s
-                 "vmul.f32     q9, q5, d0[0]               \n\t" // DST->M[m4-m7] = M[m4-m7] * s
-                 "vmul.f32     q10, q6, d0[0]              \n\t" // DST->M[m8-m11] = M[m8-m11] * s
-                 "vmul.f32     q11, q7, d0[0]              \n\t" // DST->M[m12-m15] = M[m12-m15] * s
-
-                 "vst1.32     {q8-q9},           [%0]!     \n\t" // DST->M[m0-m7]
-                 "vst1.32     {q10-q11},         [%0]      \n\t" // DST->M[m8-m15]
-                 :
-                 : "r"(dst), "r"(m), "r"(&scalar)
-                 : "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "memory"
-                 );
-}
-
-inline void MathUtilNeon::multiplyMatrix(const float* m1, const float* m2, float* dst)
-{
-    asm volatile(
-                 "vld1.32     {d16 - d19}, [%1]! \n\t"       // M1[m0-m7]
-                 "vld1.32     {d20 - d23}, [%1]  \n\t"       // M1[m8-m15]
-                 "vld1.32     {d0 - d3}, [%2]!   \n\t"       // M2[m0-m7]
-                 "vld1.32     {d4 - d7}, [%2]    \n\t"       // M2[m8-m15]
-
-                 "vmul.f32    q12, q8, d0[0]     \n\t"         // DST->M[m0-m3] = M1[m0-m3] * M2[m0]
-                 "vmul.f32    q13, q8, d2[0]     \n\t"         // DST->M[m4-m7] = M1[m4-m7] * M2[m4]
-                 "vmul.f32    q14, q8, d4[0]     \n\t"         // DST->M[m8-m11] = M1[m8-m11] * M2[m8]
-                 "vmul.f32    q15, q8, d6[0]     \n\t"         // DST->M[m12-m15] = M1[m12-m15] * M2[m12]
-
-                 "vmla.f32    q12, q9, d0[1]     \n\t"         // DST->M[m0-m3] += M1[m0-m3] * M2[m1]
-                 "vmla.f32    q13, q9, d2[1]     \n\t"         // DST->M[m4-m7] += M1[m4-m7] * M2[m5]
-                 "vmla.f32    q14, q9, d4[1]     \n\t"         // DST->M[m8-m11] += M1[m8-m11] * M2[m9]
-                 "vmla.f32    q15, q9, d6[1]     \n\t"         // DST->M[m12-m15] += M1[m12-m15] * M2[m13]
-
-                 "vmla.f32    q12, q10, d1[0]    \n\t"         // DST->M[m0-m3] += M1[m0-m3] * M2[m2]
-                 "vmla.f32    q13, q10, d3[0]    \n\t"         // DST->M[m4-m7] += M1[m4-m7] * M2[m6]
-                 "vmla.f32    q14, q10, d5[0]    \n\t"         // DST->M[m8-m11] += M1[m8-m11] * M2[m10]
-                 "vmla.f32    q15, q10, d7[0]    \n\t"         // DST->M[m12-m15] += M1[m12-m15] * M2[m14]
-
-                 "vmla.f32    q12, q11, d1[1]    \n\t"         // DST->M[m0-m3] += M1[m0-m3] * M2[m3]
-                 "vmla.f32    q13, q11, d3[1]    \n\t"         // DST->M[m4-m7] += M1[m4-m7] * M2[m7]
-                 "vmla.f32    q14, q11, d5[1]    \n\t"         // DST->M[m8-m11] += M1[m8-m11] * M2[m11]
-                 "vmla.f32    q15, q11, d7[1]    \n\t"         // DST->M[m12-m15] += M1[m12-m15] * M2[m15]
-
-                 "vst1.32    {d24 - d27}, [%0]!  \n\t"       // DST->M[m0-m7]
-                 "vst1.32    {d28 - d31}, [%0]   \n\t"       // DST->M[m8-m15]
-
-                 : // output
-                 : "r"(dst), "r"(m1), "r"(m2) // input - note *value* of pointer doesn't change.
-                 : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-                 );
-}
-
-inline void MathUtilNeon::negateMatrix(const float* m, float* dst)
-{
-    asm volatile(
-                 "vld1.32     {q0-q1},  [%1]!     \n\t" // load m0-m7
-                 "vld1.32     {q2-q3},  [%1]      \n\t" // load m8-m15
-
-                 "vneg.f32     q4, q0             \n\t" // negate m0-m3
-                 "vneg.f32     q5, q1             \n\t" // negate m4-m7
-                 "vneg.f32     q6, q2             \n\t" // negate m8-m15
-                 "vneg.f32     q7, q3             \n\t" // negate m8-m15
-
-                 "vst1.32     {q4-q5},  [%0]!     \n\t" // store m0-m7
-                 "vst1.32     {q6-q7},  [%0]      \n\t" // store m8-m15
-                 :
-                 : "r"(dst), "r"(m)
-                 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory"
-                 );
-}
-
-inline void MathUtilNeon::transposeMatrix(const float* m, float* dst)
-{
-    asm volatile(
-                 "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%1]!    \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3]
-                 "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1]!    \n\t" // DST->M[m1, m5, m9, m12] = M[m4-m7]
-                 "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%1]!    \n\t" // DST->M[m2, m6, m10, m12] = M[m8-m11]
-                 "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%1]     \n\t" // DST->M[m3, m7, m11, m12] = M[m12-m15]
-
-                 "vst1.32 {q0-q1}, [%0]!                         \n\t" // DST->M[m0-m7]
-                 "vst1.32 {q2-q3}, [%0]                          \n\t" // DST->M[m8-m15]
-                 :
-                 : "r"(dst), "r"(m)
-                 : "q0", "q1", "q2", "q3", "memory"
-                 );
-}
-
-inline void MathUtilNeon::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
-{
-    asm volatile(
-                 "vld1.32    {d0[0]},        [%1]    \n\t"    // V[x]
-                 "vld1.32    {d0[1]},        [%2]    \n\t"    // V[y]
-                 "vld1.32    {d1[0]},        [%3]    \n\t"    // V[z]
-                 "vld1.32    {d1[1]},        [%4]    \n\t"    // V[w]
-                 "vld1.32    {d18 - d21},    [%5]!   \n\t"    // M[m0-m7]
-                 "vld1.32    {d22 - d25},    [%5]    \n\t"    // M[m8-m15]
-
-                 "vmul.f32 q13,  q9, d0[0]           \n\t"    // DST->V = M[m0-m3] * V[x]
-                 "vmla.f32 q13, q10, d0[1]           \n\t"    // DST->V += M[m4-m7] * V[y]
-                 "vmla.f32 q13, q11, d1[0]           \n\t"    // DST->V += M[m8-m11] * V[z]
-                 "vmla.f32 q13, q12, d1[1]           \n\t"    // DST->V += M[m12-m15] * V[w]
-
-                 "vst1.32 {d26}, [%0]!               \n\t"    // DST->V[x, y]
-                 "vst1.32 {d27[0]}, [%0]             \n\t"    // DST->V[z]
-                 :
-                 : "r"(dst), "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
-                 : "q0", "q9", "q10","q11", "q12", "q13", "memory"
-                 );
-}
-
-inline void MathUtilNeon::transformVec4(const float* m, const float* v, float* dst)
-{
-    asm volatile
-    (
-     "vld1.32    {d0, d1}, [%1]     \n\t"   // V[x, y, z, w]
-     "vld1.32    {d18 - d21}, [%2]! \n\t"   // M[m0-m7]
-     "vld1.32    {d22 - d25}, [%2]  \n\t"    // M[m8-m15]
-
-     "vmul.f32   q13, q9, d0[0]     \n\t"   // DST->V = M[m0-m3] * V[x]
-     "vmla.f32   q13, q10, d0[1]    \n\t"   // DST->V = M[m4-m7] * V[y]
-     "vmla.f32   q13, q11, d1[0]    \n\t"   // DST->V = M[m8-m11] * V[z]
-     "vmla.f32   q13, q12, d1[1]    \n\t"   // DST->V = M[m12-m15] * V[w]
-
-     "vst1.32    {d26, d27}, [%0]   \n\t"   // DST->V
-     :
-     : "r"(dst), "r"(v), "r"(m)
-     : "q0", "q9", "q10","q11", "q12", "q13", "memory"
-     );
-}
-
-inline void MathUtilNeon::crossVec3(const float* v1, const float* v2, float* dst)
-{
-    asm volatile(
-                 "vld1.32 {d1[1]},  [%1]         \n\t" //
-                 "vld1.32 {d0},     [%2]         \n\t" //
-                 "vmov.f32 s2, s1                \n\t" // q0 = (v1y, v1z, v1z, v1x)
-
-                 "vld1.32 {d2[1]},  [%3]         \n\t" //
-                 "vld1.32 {d3},     [%4]         \n\t" //
-                 "vmov.f32 s4, s7                  \n\t" // q1 = (v2z, v2x, v2y, v2z)
-
-                 "vmul.f32 d4, d0, d2            \n\t" // x = v1y * v2z, y = v1z * v2x
-                 "vmls.f32 d4, d1, d3            \n\t" // x -= v1z * v2y, y-= v1x - v2z
-
-                 "vmul.f32 d5, d3, d1[1]         \n\t" // z = v1x * v2y
-                 "vmls.f32 d5, d0, d2[1]         \n\t" // z-= v1y * vx
-
-                 "vst1.32 {d4},       [%0]!      \n\t" // V[x, y]
-                 "vst1.32 {d5[0]}, [%0]          \n\t" // V[z]
-                 :
-                 : "r"(dst), "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
-                 : "q0", "q1", "q2", "memory"
-                 );
-}
-
-inline void MathUtilNeon::transformVertices(ax::V3F_C4B_T2F* dst, const ax::V3F_C4B_T2F* src, size_t count, const ax::Mat4& transform)
-{
-    auto end = dst + count;
+    inline static void multiplyMatrix(const _xm128_t* m, float scalar, _xm128_t* dst)  // dst[i] = m[i] * scalar for i in 0..3
+    {
+        _xm128_t s = vdupq_n_f32(scalar);  // scalar replicated into every float lane
+        UTILS_UNROLL
+        for (int i = 0; i < 4; ++i)
+        {
+            dst[i] = vmulq_f32(m[i], s);  // lane-wise multiply of one 4-float vector
+        }
+    }
 
-    // Load matrix
-    float32x4_t mc0 = vld1q_f32(transform.m);
-    float32x4_t mc1 = vld1q_f32(transform.m + 4);
-    float32x4_t mc2 = vld1q_f32(transform.m + 8);
-    float32x4_t mc3 = vld1q_f32(transform.m + 12);
+    inline static void multiplyMatrix(const _xm128_t* m1, const _xm128_t* m2, _xm128_t* dst)  // dst[i] = sum over k of m1[k] * (lane k of m2[i])
+    {
+        float32x4_t product[4];  // results are staged here and copied to dst only after every read of m1/m2
+        float32x4_t val;
+        UTILS_UNROLL
+        for (int i = 0; i < 4; ++i)
+        {
+            val        = vmulq_n_f32(m1[0], vgetq_lane_f32(m2[i], 0));       // val  = m1[0] * m2[i] lane 0
+            val        = vmlaq_n_f32(val, m1[1], vgetq_lane_f32(m2[i], 1));  // val += m1[1] * m2[i] lane 1
+            val        = vmlaq_n_f32(val, m1[2], vgetq_lane_f32(m2[i], 2));  // val += m1[2] * m2[i] lane 2
+            val        = vmlaq_n_f32(val, m1[3], vgetq_lane_f32(m2[i], 3));  // val += m1[3] * m2[i] lane 3
+            product[i] = val;
+        }
+        memcpy(dst, product, sizeof(product));
+    }
 
-    // Process 4 vertices at a time
-    auto end4 = dst + count / 4 * 4;
-    while (dst < end4)
+    inline static void negateMatrix(const _xm128_t* m, _xm128_t* dst)  // dst[i] = -m[i] for i in 0..3
     {
-        // Load 4 vertices. Note that color will also get loaded into w
-        float32x2_t xy0 = vld1_f32(&src[0].vertices.x);
-        float32x2_t zw0 = vld1_f32(&src[0].vertices.z);
-        float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
-        float32x2_t xy1 = vld1_f32(&src[1].vertices.x);
-        float32x2_t zw1 = vld1_f32(&src[1].vertices.z);
-        float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
-        float32x2_t xy2 = vld1_f32(&src[2].vertices.x);
-        float32x2_t zw2 = vld1_f32(&src[2].vertices.z);
-        float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
-        float32x2_t xy3 = vld1_f32(&src[3].vertices.x);
-        float32x2_t zw3 = vld1_f32(&src[3].vertices.z);
-        float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
-
-        // Multiply x by column 0
-        float32x4_t r0 = vmulq_lane_f32(mc0, xy0, 0);
-        float32x4_t r1 = vmulq_lane_f32(mc0, xy1, 0);
-        float32x4_t r2 = vmulq_lane_f32(mc0, xy2, 0);
-        float32x4_t r3 = vmulq_lane_f32(mc0, xy3, 0);
-
-        // Multiply y by column 1 and add to result
-        r0 = vmlaq_lane_f32(r0, mc1, xy0, 1);
-        r1 = vmlaq_lane_f32(r1, mc1, xy1, 1);
-        r2 = vmlaq_lane_f32(r2, mc1, xy2, 1);
-        r3 = vmlaq_lane_f32(r3, mc1, xy3, 1);
-
-        // Multiply z by column 2 and add to result
-        r0 = vmlaq_lane_f32(r0, mc2, zw0, 0);
-        r1 = vmlaq_lane_f32(r1, mc2, zw1, 0);
-        r2 = vmlaq_lane_f32(r2, mc2, zw2, 0);
-        r3 = vmlaq_lane_f32(r3, mc2, zw3, 0);
-
-        // Add column 3
-        r0 = vaddq_f32(r0, mc3);
-        r1 = vaddq_f32(r1, mc3);
-        r2 = vaddq_f32(r2, mc3);
-        r3 = vaddq_f32(r3, mc3);
-
-        // Set color
-        r0 = vsetq_lane_f32(vget_lane_f32(zw0, 1), r0, 3);
-        r1 = vsetq_lane_f32(vget_lane_f32(zw1, 1), r1, 3);
-        r2 = vsetq_lane_f32(vget_lane_f32(zw2, 1), r2, 3);
-        r3 = vsetq_lane_f32(vget_lane_f32(zw3, 1), r3, 3);
-
-        // Store result
-        vst1q_f32(&dst[0].vertices.x, r0);
-        vst1_f32(&dst[0].texCoords.u, uv0);
-        vst1q_f32(&dst[1].vertices.x, r1);
-        vst1_f32(&dst[1].texCoords.u, uv1);
-        vst1q_f32(&dst[2].vertices.x, r2);
-        vst1_f32(&dst[2].texCoords.u, uv2);
-        vst1q_f32(&dst[3].vertices.x, r3);
-        vst1_f32(&dst[3].texCoords.u, uv3);
-
-        dst += 4;
-        src += 4;
+        UTILS_UNROLL
+        for (int i = 0; i < 4; ++i)
+        {
+            dst[i] = vnegq_f32(m[i]);  // lane-wise negation of one 4-float vector
+        }
     }
 
-    // Process remaining vertices
-    while (dst < end)
+    inline static void transposeMatrix(const _xm128_t* m, _xm128_t* dst)  // dst[j] lane i = m[i] lane j (4x4 transpose via two rounds of vzipq_f32)
     {
-        // Load vertex
-        float32x2_t xy = vld1_f32(&src->vertices.x);
-        float32x2_t zw = vld1_f32(&src->vertices.z);
-        float32x2_t uv = vld1_f32(&src->texCoords.u);
-
-        // Multiply x by column 0
-        float32x4_t r = vmulq_lane_f32(mc0, xy, 0);
-        // Multiply y by column 1 and add to result
-        r = vmlaq_lane_f32(r, mc1, xy, 1);
-        // Multiply z by column 2 and add to result
-        r = vmlaq_lane_f32(r, mc2, zw, 0);
-        // Add column 3
-        r = vaddq_f32(r, mc3);
-
-        // Set color
-        r = vsetq_lane_f32(vget_lane_f32(zw, 1), r, 3);
-
-        // Store result
-        vst1q_f32(&dst->vertices.x, r);
-        vst1_f32(&dst->texCoords.u, uv);
-
-        ++dst;
-        ++src;
+        auto tmp0 = vzipq_f32(m[0], m[2]);  // val[0] = lanes 0,1 of m[0] and m[2] interleaved; val[1] = lanes 2,3 interleaved
+        auto tmp1 = vzipq_f32(m[1], m[3]);  // same interleave for m[1] and m[3]
+        auto tmp2 = vzipq_f32(tmp0.val[0], tmp1.val[0]);  // val[0] = lane 0 of m[0..3]; val[1] = lane 1 of m[0..3]
+        auto tmp3 = vzipq_f32(tmp0.val[1], tmp1.val[1]);  // val[0] = lane 2 of m[0..3]; val[1] = lane 3 of m[0..3]
+
+        dst[0] = tmp2.val[0];
+        dst[1] = tmp2.val[1];
+        dst[2] = tmp3.val[0];
+        dst[3] = tmp3.val[1];
     }
-}
+
+    inline static void transformVec4(const _xm128_t* m, float x, float y, float z, float w, float* dst/*vec3*/)
+    {
+        auto v0 = vmulq_n_f32(m[0], x);
+        auto v1 = vmulq_n_f32(m[1], y);
+        auto v2 = vmulq_n_f32(m[2], z);
+        auto v3 = vmulq_n_f32(m[3], w);
+        auto prod = vaddq_f32(v0, vaddq_f32(v1, vaddq_f32(v2, v3)));  // prod = m[0]*x + m[1]*y + m[2]*z + m[3]*w
+        vst1_f32(dst, vget_low_f32(prod));  // prod lanes 0 and 1 -> dst[0], dst[1]
+        vst1_lane_f32(dst + 2, vget_high_f32(prod), 0);  // prod lane 2 -> dst[2]; lane 3 is not stored
+    }
+
+    inline static void transformVec4(const _xm128_t* m, const float* v /*vec4*/, float* dst /*vec4*/)
+    {
+        auto v0 = vmulq_n_f32(m[0], v[0]);
+        auto v1 = vmulq_n_f32(m[1], v[1]);
+        auto v2 = vmulq_n_f32(m[2], v[2]);
+        auto v3 = vmulq_n_f32(m[3], v[3]);
+        auto prod = vaddq_f32(v0, vaddq_f32(v1, vaddq_f32(v2, v3)));  // prod = m[0]*v[0] + m[1]*v[1] + m[2]*v[2] + m[3]*v[3]
+        vst1q_f32(dst, prod);  // all four lanes -> dst[0..3]
+    }
+
+    inline static void crossVec3(const float* v1, const float* v2, float* dst)  // dst[0..2] = v1 x v2 (3-component cross product)
+    {
+        // refer to:
+        // https://developer.arm.com/documentation/den0018/a/NEON-Code-Examples-with-Mixed-Operations/Cross-product/Single-cross-product
+        // Vector a = (ai, aj, ak) is stored in memory such that ai is at the lower address and
+        // ak is at the higher address. Vector b is also stored in the same way.
+
+        float32x4_t vec_a     = vcombine_f32(vld1_f32(v1 + 1), vld1_f32(v1));  // lanes 3..0 = [aj, ai, ak, aj]
+        float32x4_t vec_b     = vcombine_f32(vld1_f32(v2 + 1), vld1_f32(v2));  // lanes 3..0 = [bj, bi, bk, bj]
+        float32x4_t vec_a_rot = vextq_f32(vec_a, vec_a, 1);  // rotated down one lane: lanes 3..0 = [aj, aj, ai, ak]
+        float32x4_t vec_b_rot = vextq_f32(vec_b, vec_b, 1);  // rotated down one lane: lanes 3..0 = [bj, bj, bi, bk]
+
+        float32x4_t prod = vmulq_f32(vec_a, vec_b_rot);
+
+        // prod (lanes 3..0) = [ ajbj, aibj, akbi, ajbk ]
+
+        prod = vmlsq_f32(prod, vec_a_rot, vec_b);
+        // prod (lanes 3..0) = [ ajbj-ajbj, aibj-ajbi, akbi-aibk, ajbk-akbj ]
+
+        vst1_f32(dst, vget_low_f32(prod));               // Store lanes 0 and 1 (cross.x, cross.y) to dst[0], dst[1]
+        vst1_lane_f32(dst + 2, vget_high_f32(prod), 0);  // Store lane 2 (cross.z) to dst[2]
+    }
+
+#if AX_64BITS
+    inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)  // transforms count vertex positions from src into dst
+    {
+        auto end = dst + count;  // one past the last vertex to write
+
+        // Load matrix
+        float32x4x4_t m = vld1q_f32_x4(transform.m);  // m.val[0..3] = the four consecutive 4-float groups of transform.m
+
+        // Process 4 vertices at a time if there's enough data
+        auto end4 = dst + count / 4 * 4;  // count rounded down to a multiple of 4
+        while (dst < end4)
+        {
+            // Do this for each vertex
+            // dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8]  + m[12];
+            // dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9]  + m[13];
+            // dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
+
+            // First, load each vertex, multiply x by column 0 and add to column 3
+            // Note: since we're reading 4 floats it will load color bytes into v.w
+            float32x4_t v0 = vld1q_f32(&src[0].vertices.x);
+            float32x4_t r0 = vmlaq_laneq_f32(m.val[3], m.val[0], v0, 0);  // r0 = m.val[3] + m.val[0] * (v0 lane 0)
+            float32x4_t v1 = vld1q_f32(&src[1].vertices.x);
+            float32x4_t r1 = vmlaq_laneq_f32(m.val[3], m.val[0], v1, 0);
+            float32x4_t v2 = vld1q_f32(&src[2].vertices.x);
+            float32x4_t r2 = vmlaq_laneq_f32(m.val[3], m.val[0], v2, 0);
+            float32x4_t v3 = vld1q_f32(&src[3].vertices.x);
+            float32x4_t r3 = vmlaq_laneq_f32(m.val[3], m.val[0], v3, 0);
+
+            // Load texCoords
+            float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
+            float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
+            float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
+            float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
+
+            // Multiply y by column 1 and add to result
+            r0 = vmlaq_laneq_f32(r0, m.val[1], v0, 1);
+            r1 = vmlaq_laneq_f32(r1, m.val[1], v1, 1);
+            r2 = vmlaq_laneq_f32(r2, m.val[1], v2, 1);
+            r3 = vmlaq_laneq_f32(r3, m.val[1], v3, 1);
+
+            // Multiply z by column 2 and add to result
+            r0 = vmlaq_laneq_f32(r0, m.val[2], v0, 2);
+            r1 = vmlaq_laneq_f32(r1, m.val[2], v1, 2);
+            r2 = vmlaq_laneq_f32(r2, m.val[2], v2, 2);
+            r3 = vmlaq_laneq_f32(r3, m.val[2], v3, 2);
+
+            // Set w to loaded color (lane 3 of the result is replaced by lane 3 of the source load)
+            r0 = vsetq_lane_f32(vgetq_lane_f32(v0, 3), r0, 3);
+            r1 = vsetq_lane_f32(vgetq_lane_f32(v1, 3), r1, 3);
+            r2 = vsetq_lane_f32(vgetq_lane_f32(v2, 3), r2, 3);
+            r3 = vsetq_lane_f32(vgetq_lane_f32(v3, 3), r3, 3);
+
+            // Store result
+            vst1q_f32(&dst[0].vertices.x, r0);
+            vst1_f32(&dst[0].texCoords.u, uv0);
+            vst1q_f32(&dst[1].vertices.x, r1);
+            vst1_f32(&dst[1].texCoords.u, uv1);
+            vst1q_f32(&dst[2].vertices.x, r2);
+            vst1_f32(&dst[2].texCoords.u, uv2);
+            vst1q_f32(&dst[3].vertices.x, r3);
+            vst1_f32(&dst[3].texCoords.u, uv3);
+
+            dst += 4;
+            src += 4;
+        }
+
+        // Process remaining vertices one by one
+        while (dst < end)
+        {
+            float32x4_t v  = vld1q_f32(&src->vertices.x);  // four floats starting at vertices.x
+            float32x4_t r  = vmlaq_laneq_f32(m.val[3], m.val[0], v, 0);  // r  = m.val[3] + m.val[0] * (v lane 0)
+            r              = vmlaq_laneq_f32(r, m.val[1], v, 1);  // r += m.val[1] * (v lane 1)
+            r              = vmlaq_laneq_f32(r, m.val[2], v, 2);  // r += m.val[2] * (v lane 2)
+            r              = vsetq_lane_f32(vgetq_lane_f32(v, 3), r, 3);  // lane 3 of the source load is carried through
+            float32x2_t uv = vld1_f32(&src->texCoords.u);
+            vst1q_f32(&dst->vertices.x, r);
+            vst1_f32(&dst->texCoords.u, uv);
+
+            ++dst;
+            ++src;
+        }
+    }
+
+    inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)  // dst[i] = src[i] + offset for count indices
+    {
+        auto end = dst + count;  // NOTE(review): 'end' is never read in this function
+        auto off = vdupq_n_u16(offset);  // offset replicated into every 16-bit lane
+
+        if (count < 8)  // too few indices for even one 8-index vector step
+            goto LEFTOVER;
+
+        // Process 32 indices at a time if there's enough data
+        while (count >= 32)
+        {
+            // Load 32 indices
+            uint16x8x4_t v = vld1q_u16_x4(src);
+
+            // Add offset
+            v.val[0] = vaddq_u16(v.val[0], off);
+            v.val[1] = vaddq_u16(v.val[1], off);
+            v.val[2] = vaddq_u16(v.val[2], off);
+            v.val[3] = vaddq_u16(v.val[3], off);
+
+            // Store result
+            vst1q_u16_x4(dst, v);
+
+            dst += 32;
+            src += 32;
+            count -= 32;
+        }
+
+        // Process 8 indices at a time if there's enough data
+        while (count >= 8)
+        {
+            uint16x8_t v = vld1q_u16(src);
+            v            = vaddq_u16(v, off);
+            vst1q_u16(dst, v);
+
+            dst += 8;
+            src += 8;
+            count -= 8;
+        }
+
+    LEFTOVER:
+        // Process remaining indices one by one
+        while (count > 0)
+        {
+            *dst = *src + offset;  // the sum is converted back to uint16_t on assignment
+            ++dst;
+            ++src;
+            --count;
+        }
+    }
+#else
+    inline static void transformVertices(ax::V3F_C4B_T2F* dst,
+                                         const ax::V3F_C4B_T2F* src,
+                                         size_t count,
+                                         const ax::Mat4& transform)
+    {
+        auto end = dst + count;  // one past the last vertex to write
+
+        // Load matrix
+        float32x4_t mc0 = vld1q_f32(transform.m);       // transform.m[0..3]
+        float32x4_t mc1 = vld1q_f32(transform.m + 4);   // transform.m[4..7]
+        float32x4_t mc2 = vld1q_f32(transform.m + 8);   // transform.m[8..11]
+        float32x4_t mc3 = vld1q_f32(transform.m + 12);  // transform.m[12..15]
+
+        // Process 4 vertices at a time
+        auto end4 = dst + count / 4 * 4;  // count rounded down to a multiple of 4
+        while (dst < end4)
+        {
+            // Load 4 vertices. Note that color will also get loaded into w
+            // (zwN holds vertices.z in lane 0 and the 4 bytes that follow it in lane 1)
+            float32x2_t xy0 = vld1_f32(&src[0].vertices.x);
+            float32x2_t zw0 = vld1_f32(&src[0].vertices.z);
+            float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
+            float32x2_t xy1 = vld1_f32(&src[1].vertices.x);
+            float32x2_t zw1 = vld1_f32(&src[1].vertices.z);
+            float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
+            float32x2_t xy2 = vld1_f32(&src[2].vertices.x);
+            float32x2_t zw2 = vld1_f32(&src[2].vertices.z);
+            float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
+            float32x2_t xy3 = vld1_f32(&src[3].vertices.x);
+            float32x2_t zw3 = vld1_f32(&src[3].vertices.z);
+            float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
+
+            // Multiply x by column 0
+            float32x4_t r0 = vmulq_lane_f32(mc0, xy0, 0);
+            float32x4_t r1 = vmulq_lane_f32(mc0, xy1, 0);
+            float32x4_t r2 = vmulq_lane_f32(mc0, xy2, 0);
+            float32x4_t r3 = vmulq_lane_f32(mc0, xy3, 0);
+
+            // Multiply y by column 1 and add to result
+            r0 = vmlaq_lane_f32(r0, mc1, xy0, 1);
+            r1 = vmlaq_lane_f32(r1, mc1, xy1, 1);
+            r2 = vmlaq_lane_f32(r2, mc1, xy2, 1);
+            r3 = vmlaq_lane_f32(r3, mc1, xy3, 1);
+
+            // Multiply z by column 2 and add to result
+            r0 = vmlaq_lane_f32(r0, mc2, zw0, 0);
+            r1 = vmlaq_lane_f32(r1, mc2, zw1, 0);
+            r2 = vmlaq_lane_f32(r2, mc2, zw2, 0);
+            r3 = vmlaq_lane_f32(r3, mc2, zw3, 0);
+
+            // Add column 3
+            r0 = vaddq_f32(r0, mc3);
+            r1 = vaddq_f32(r1, mc3);
+            r2 = vaddq_f32(r2, mc3);
+            r3 = vaddq_f32(r3, mc3);
+
+            // Set color (lane 3 of the result is replaced by lane 1 of zwN)
+            r0 = vsetq_lane_f32(vget_lane_f32(zw0, 1), r0, 3);
+            r1 = vsetq_lane_f32(vget_lane_f32(zw1, 1), r1, 3);
+            r2 = vsetq_lane_f32(vget_lane_f32(zw2, 1), r2, 3);
+            r3 = vsetq_lane_f32(vget_lane_f32(zw3, 1), r3, 3);
+
+            // Store result
+            vst1q_f32(&dst[0].vertices.x, r0);
+            vst1_f32(&dst[0].texCoords.u, uv0);
+            vst1q_f32(&dst[1].vertices.x, r1);
+            vst1_f32(&dst[1].texCoords.u, uv1);
+            vst1q_f32(&dst[2].vertices.x, r2);
+            vst1_f32(&dst[2].texCoords.u, uv2);
+            vst1q_f32(&dst[3].vertices.x, r3);
+            vst1_f32(&dst[3].texCoords.u, uv3);
+
+            dst += 4;
+            src += 4;
+        }
+
+        // Process remaining vertices
+        while (dst < end)
+        {
+            // Load vertex
+            float32x2_t xy = vld1_f32(&src->vertices.x);   // two floats starting at vertices.x
+            float32x2_t zw = vld1_f32(&src->vertices.z);   // vertices.z in lane 0, the following 4 bytes in lane 1
+            float32x2_t uv = vld1_f32(&src->texCoords.u);  // two floats starting at texCoords.u
+
+            // Multiply x by column 0
+            float32x4_t r = vmulq_lane_f32(mc0, xy, 0);
+            // Multiply y by column 1 and add to result
+            r = vmlaq_lane_f32(r, mc1, xy, 1);
+            // Multiply z by column 2 and add to result
+            r = vmlaq_lane_f32(r, mc2, zw, 0);
+            // Add column 3
+            r = vaddq_f32(r, mc3);
+
+            // Set color
+            r = vsetq_lane_f32(vget_lane_f32(zw, 1), r, 3);
+
+            // Store result
+            vst1q_f32(&dst->vertices.x, r);
+            vst1_f32(&dst->texCoords.u, uv);
+
+            ++dst;
+            ++src;
+        }
+    }
+#endif
+};
 
 NS_AX_MATH_END
diff --git a/core/math/MathUtilNeon64.inl b/core/math/MathUtilNeon64.inl
deleted file mode 100644
index 1bfb02759dc1..000000000000
--- a/core/math/MathUtilNeon64.inl
+++ /dev/null
@@ -1,398 +0,0 @@
-/**
- Copyright 2013 BlackBerry Inc.
- Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
- Original file from GamePlay3D: http://gameplay3d.org
-
- This file was modified to fit the cocos2d-x project
- */
-
-#include <arm_neon.h>
-#include "base/Types.h"
-
-NS_AX_MATH_BEGIN
-
-class MathUtilNeon64
-{
-public:
-    inline static void addMatrix(const float* m, float scalar, float* dst);
-    inline static void addMatrix(const float* m1, const float* m2, float* dst);
-    inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
-    inline static void multiplyMatrix(const float* m, float scalar, float* dst);
-    inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
-
-    inline static void negateMatrix(const float* m, float* dst);
-    inline static void transposeMatrix(const float* m, float* dst);
-
-    inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
-    inline static void transformVec4(const float* m, const float* v, float* dst);
-    inline static void crossVec3(const float* v1, const float* v2, float* dst);
-
-    inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
-    inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
-};
-
-inline void MathUtilNeon64::addMatrix(const float* m, float scalar, float* dst)
-{
-    asm volatile(
-	    "ld4  {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]    	\n\t" // M[m0-m7] M[m8-m15]
-	    "ld1r {v4.4s}, [%2]				                \n\t" //ssss
-
-	    "fadd v8.4s, v0.4s, v4.4s			\n\t" // DST->M[m0-m3] = M[m0-m3] + s
-	    "fadd v9.4s, v1.4s, v4.4s			\n\t" // DST->M[m4-m7] = M[m4-m7] + s
-	    "fadd v10.4s, v2.4s, v4.4s			\n\t" // DST->M[m8-m11] = M[m8-m11] + s
-	    "fadd v11.4s, v3.4s, v4.4s			\n\t" // DST->M[m12-m15] = M[m12-m15] + s
-
-        "st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] 	\n\t"    // Result in V9
-	    :
-        : "r"(dst), "r"(m), "r"(&scalar)
-        : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "memory"
-    );
-}
-
-inline void MathUtilNeon64::addMatrix(const float* m1, const float* m2, float* dst)
-{
-    asm volatile(
-        "ld4     {v0.4s, v1.4s, v2.4s, v3.4s},     [%1] 	\n\t" // M1[m0-m7] M1[m8-m15]
-        "ld4     {v8.4s, v9.4s, v10.4s, v11.4s},   [%2] 	\n\t" // M2[m0-m7] M2[m8-m15]
-
-        "fadd   v12.4s, v0.4s, v8.4s          \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3]
-        "fadd   v13.4s, v1.4s, v9.4s          \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7]
-        "fadd   v14.4s, v2.4s, v10.4s         \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11]
-        "fadd   v15.4s, v3.4s, v11.4s         \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15]
-
-        "st4    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0]    \n\t" // DST->M[m0-m7] DST->M[m8-m15]
-        :
-        : "r"(dst), "r"(m1), "r"(m2)
-        : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
-    );
-}
-
-inline void MathUtilNeon64::subtractMatrix(const float* m1, const float* m2, float* dst)
-{
-    asm volatile(
-        "ld4     {v0.4s, v1.4s, v2.4s, v3.4s},     [%1]  \n\t" // M1[m0-m7] M1[m8-m15]
-        "ld4     {v8.4s, v9.4s, v10.4s, v11.4s},   [%2]  \n\t" // M2[m0-m7] M2[m8-m15]
-
-        "fsub   v12.4s, v0.4s, v8.4s         \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3]
-        "fsub   v13.4s, v1.4s, v9.4s         \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7]
-        "fsub   v14.4s, v2.4s, v10.4s        \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11]
-        "fsub   v15.4s, v3.4s, v11.4s        \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15]
-
-        "st4    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0]   \n\t" // DST->M[m0-m7] DST->M[m8-m15]
-        :
-        : "r"(dst), "r"(m1), "r"(m2)
-        : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
-    );
-}
-
-inline void MathUtilNeon64::multiplyMatrix(const float* m, float scalar, float* dst)
-{
-    asm volatile(
-        "ld1     {v0.s}[0],         [%2]            \n\t" //s
-        "ld4     {v4.4s, v5.4s, v6.4s, v7.4s}, [%1]       \n\t" //M[m0-m7] M[m8-m15]
-
-        "fmul     v8.4s, v4.4s, v0.s[0]               \n\t" // DST->M[m0-m3] = M[m0-m3] * s
-        "fmul     v9.4s, v5.4s, v0.s[0]               \n\t" // DST->M[m4-m7] = M[m4-m7] * s
-        "fmul     v10.4s, v6.4s, v0.s[0]              \n\t" // DST->M[m8-m11] = M[m8-m11] * s
-        "fmul     v11.4s, v7.4s, v0.s[0]              \n\t" // DST->M[m12-m15] = M[m12-m15] * s
-
-        "st4     {v8.4s, v9.4s, v10.4s, v11.4s},           [%0]     \n\t" // DST->M[m0-m7] DST->M[m8-m15]
-        :
-        : "r"(dst), "r"(m), "r"(&scalar)
-        : "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
-    );
-}
-
-inline void MathUtilNeon64::multiplyMatrix(const float* m1, const float* m2, float* dst)
-{
-    asm volatile(
-        "ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [%1] \n\t"       // M1[m0-m7] M1[m8-m15] M2[m0-m7]  M2[m8-m15]
-        "ld4     {v0.4s, v1.4s, v2.4s, v3.4s},  [%2]   \n\t"       // M2[m0-m15]
-
-
-        "fmul    v12.4s, v8.4s, v0.s[0]     \n\t"         // DST->M[m0-m3] = M1[m0-m3] * M2[m0]
-        "fmul    v13.4s, v8.4s, v0.s[1]     \n\t"         // DST->M[m4-m7] = M1[m4-m7] * M2[m4]
-        "fmul    v14.4s, v8.4s, v0.s[2]     \n\t"         // DST->M[m8-m11] = M1[m8-m11] * M2[m8]
-        "fmul    v15.4s, v8.4s, v0.s[3]     \n\t"         // DST->M[m12-m15] = M1[m12-m15] * M2[m12]
-
-        "fmla    v12.4s, v9.4s, v1.s[0]     \n\t"         // DST->M[m0-m3] += M1[m0-m3] * M2[m1]
-        "fmla    v13.4s, v9.4s, v1.s[1]     \n\t"         // DST->M[m4-m7] += M1[m4-m7] * M2[m5]
-        "fmla    v14.4s, v9.4s, v1.s[2]     \n\t"         // DST->M[m8-m11] += M1[m8-m11] * M2[m9]
-        "fmla    v15.4s, v9.4s, v1.s[3]     \n\t"         // DST->M[m12-m15] += M1[m12-m15] * M2[m13]
-
-        "fmla    v12.4s, v10.4s, v2.s[0]    \n\t"         // DST->M[m0-m3] += M1[m0-m3] * M2[m2]
-        "fmla    v13.4s, v10.4s, v2.s[1]    \n\t"         // DST->M[m4-m7] += M1[m4-m7] * M2[m6]
-        "fmla    v14.4s, v10.4s, v2.s[2]    \n\t"         // DST->M[m8-m11] += M1[m8-m11] * M2[m10]
-        "fmla    v15.4s, v10.4s, v2.s[3]    \n\t"         // DST->M[m12-m15] += M1[m12-m15] * M2[m14]
-
-        "fmla    v12.4s, v11.4s, v3.s[0]    \n\t"         // DST->M[m0-m3] += M1[m0-m3] * M2[m3]
-        "fmla    v13.4s, v11.4s, v3.s[1]    \n\t"         // DST->M[m4-m7] += M1[m4-m7] * M2[m7]
-        "fmla    v14.4s, v11.4s, v3.s[2]    \n\t"         // DST->M[m8-m11] += M1[m8-m11] * M2[m11]
-        "fmla    v15.4s, v11.4s, v3.s[3]    \n\t"         // DST->M[m12-m15] += M1[m12-m15] * M2[m15]
-
-        "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0]  \n\t"       // DST->M[m0-m7]// DST->M[m8-m15]
-
-        : // output
-        : "r"(dst), "r"(m1), "r"(m2) // input - note *value* of pointer doesn't change.
-        : "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
-     );
-}
-
-inline void MathUtilNeon64::negateMatrix(const float* m, float* dst)
-{
-    asm volatile(
-        "ld4     {v0.4s, v1.4s, v2.4s, v3.4s},  [%1]     \n\t" // load m0-m7 load m8-m15
-
-        "fneg     v4.4s, v0.4s             \n\t" // negate m0-m3
-        "fneg     v5.4s, v1.4s             \n\t" // negate m4-m7
-        "fneg     v6.4s, v2.4s             \n\t" // negate m8-m15
-        "fneg     v7.4s, v3.4s             \n\t" // negate m8-m15
-
-        "st4     {v4.4s, v5.4s, v6.4s, v7.4s},  [%0]     \n\t" // store m0-m7 store m8-m15
-        :
-        : "r"(dst), "r"(m)
-        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory"
-    );
-}
-
-inline void MathUtilNeon64::transposeMatrix(const float* m, float* dst)
-{
-    asm volatile(
-        "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]    \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3]
-							 //DST->M[m1, m5, m9, m12] = M[m4-m7]
-        "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]    \n\t"
-        :
-        : "r"(dst), "r"(m)
-        : "v0", "v1", "v2", "v3", "memory"
-    );
-}
-
-inline void MathUtilNeon64::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
-{
-    asm volatile(
-        "ld1    {v0.s}[0],        [%1]    \n\t"    // V[x]
-        "ld1    {v0.s}[1],        [%2]    \n\t"    // V[y]
-        "ld1    {v0.s}[2],        [%3]    \n\t"    // V[z]
-        "ld1    {v0.s}[3],        [%4]    \n\t"    // V[w]
-        "ld1    {v9.4s, v10.4s, v11.4s, v12.4s}, [%5]   \n\t"    // M[m0-m7] M[m8-m15]
-
-
-        "fmul v13.4s, v9.4s, v0.s[0]           \n\t"      // DST->V = M[m0-m3] * V[x]
-        "fmla v13.4s, v10.4s, v0.s[1]           \n\t"    // DST->V += M[m4-m7] * V[y]
-        "fmla v13.4s, v11.4s, v0.s[2]           \n\t"    // DST->V += M[m8-m11] * V[z]
-        "fmla v13.4s, v12.4s, v0.s[3]           \n\t"    // DST->V += M[m12-m15] * V[w]
-
-        //"st1 {v13.4s}, [%0]               \n\t"    // DST->V[x, y] // DST->V[z]
-        "st1 {v13.2s}, [%0], 8               \n\t"
-        "st1 {v13.s}[2], [%0]                \n\t"
-        : "+r"(dst)
-        : "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
-        : "v0", "v9", "v10","v11", "v12", "v13", "memory"
-    );
-}
-
-inline void MathUtilNeon64::transformVec4(const float* m, const float* v, float* dst)
-{
-    asm volatile
-    (
-        "ld1    {v0.4s}, [%1]     \n\t"   // V[x, y, z, w]
-        "ld1    {v9.4s, v10.4s, v11.4s, v12.4s}, [%2] \n\t"   // M[m0-m7] M[m8-m15]
-
-        "fmul   v13.4s, v9.4s, v0.s[0]     \n\t"   // DST->V = M[m0-m3] * V[x]
-        "fmla   v13.4s, v10.4s, v0.s[1]    \n\t"   // DST->V = M[m4-m7] * V[y]
-        "fmla   v13.4s, v11.4s, v0.s[2]    \n\t"   // DST->V = M[m8-m11] * V[z]
-        "fmla   v13.4s, v12.4s, v0.s[3]    \n\t"   // DST->V = M[m12-m15] * V[w]
-
-        "st1    {v13.4s}, [%0]  	 \n\t"   // DST->V
-        :
-        : "r"(dst), "r"(v), "r"(m)
-        : "v0", "v9", "v10","v11", "v12", "v13", "memory"
-    );
-}
-
-inline void MathUtilNeon64::crossVec3(const float* v1, const float* v2, float* dst)
-{
-        asm volatile(
-        "ld1 {v0.2s},  [%2]           \n\t"
-        "ld1 {v0.s}[2],  [%1]           \n\t"
-        "mov v0.s[3], v0.s[0]         \n\t" // q0 = (v1y, v1z, v1x, v1x)
-
-        "ld1 {v1.4s},  [%3]           \n\t"
-        "mov v1.s[3], v1.s[0]           \n\t" // q1 = (v2x, v2y, v2z, v2x)
-
-        "fmul v2.4s, v0.4s, v1.4s            \n\t" // x = v1y * v2z, y = v1z * v2x
-
-
-        "mov v0.s[0], v0.s[1]           \n\t"
-        "mov v0.s[1], v0.s[2]           \n\t"
-        "mov v0.s[2], v0.s[3]           \n\t"
-
-        "mov v1.s[3], v1.s[2]           \n\t"
-
-        "fmul v0.4s, v0.4s, v1.4s            \n\t"
-
-        "mov v0.s[3], v0.s[1]           \n\t"
-        "mov v0.s[1], v0.s[2]           \n\t"
-        "mov v0.s[2], v0.s[0]           \n\t"
-
-        "fsub v2.4s, v0.4s, v2.4s            \n\t"
-
-        "mov v2.s[0], v2.s[1]           \n\t"
-        "mov v2.s[1], v2.s[2]           \n\t"
-        "mov v2.s[2], v2.s[3]           \n\t"
-
-        "st1 {v2.2s},       [%0], 8      \n\t" // V[x, y]
-        "st1 {v2.s}[2],     [%0]         \n\t" // V[z]
-        : "+r"(dst)
-        : "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
-        : "v0", "v1", "v2", "memory"
-    );
-}
-
-inline void MathUtilNeon64::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
-{
-    auto end = dst + count;
-
-    // Load matrix
-    float32x4x4_t m = vld1q_f32_x4(transform.m);
-
-    // Process 4 vertices at a time if there's enough data
-    auto end4 = dst + count / 4 * 4;
-    while (dst < end4)
-    {
-        // Do this for each vertex
-        // dst->vertices.x = pos.x * m[0] + pos.y * m[4] + pos.z * m[8]  + m[12];
-        // dst->vertices.y = pos.x * m[1] + pos.y * m[5] + pos.z * m[9]  + m[13];
-        // dst->vertices.z = pos.x * m[2] + pos.y * m[6] + pos.z * m[10] + m[14];
-
-        // First, load each vertex, multiply x by column 0 and add to column 3
-        // Note: since we're reading 4 floats it will load color bytes into v.w
-        float32x4_t v0 = vld1q_f32(&src[0].vertices.x);
-        float32x4_t r0 = vmlaq_laneq_f32(m.val[3], m.val[0], v0, 0);
-        float32x4_t v1 = vld1q_f32(&src[1].vertices.x);
-        float32x4_t r1 = vmlaq_laneq_f32(m.val[3], m.val[0], v1, 0);
-        float32x4_t v2 = vld1q_f32(&src[2].vertices.x);
-        float32x4_t r2 = vmlaq_laneq_f32(m.val[3], m.val[0], v2, 0);
-        float32x4_t v3 = vld1q_f32(&src[3].vertices.x);
-        float32x4_t r3 = vmlaq_laneq_f32(m.val[3], m.val[0], v3, 0);
-
-        // Load texCoords
-        float32x2_t uv0 = vld1_f32(&src[0].texCoords.u);
-        float32x2_t uv1 = vld1_f32(&src[1].texCoords.u);
-        float32x2_t uv2 = vld1_f32(&src[2].texCoords.u);
-        float32x2_t uv3 = vld1_f32(&src[3].texCoords.u);
-
-        // Multiply y by column 1 and add to result
-        r0 = vmlaq_laneq_f32(r0, m.val[1], v0, 1);
-        r1 = vmlaq_laneq_f32(r1, m.val[1], v1, 1);
-        r2 = vmlaq_laneq_f32(r2, m.val[1], v2, 1);
-        r3 = vmlaq_laneq_f32(r3, m.val[1], v3, 1);
-
-        // Multiply z by column 2 and add to result
-        r0 = vmlaq_laneq_f32(r0, m.val[2], v0, 2);
-        r1 = vmlaq_laneq_f32(r1, m.val[2], v1, 2);
-        r2 = vmlaq_laneq_f32(r2, m.val[2], v2, 2);
-        r3 = vmlaq_laneq_f32(r3, m.val[2], v3, 2);
-
-        // Set w to loaded color
-        r0 = vsetq_lane_f32(vgetq_lane_f32(v0, 3), r0, 3);
-        r1 = vsetq_lane_f32(vgetq_lane_f32(v1, 3), r1, 3);
-        r2 = vsetq_lane_f32(vgetq_lane_f32(v2, 3), r2, 3);
-        r3 = vsetq_lane_f32(vgetq_lane_f32(v3, 3), r3, 3);
-
-        // Store result
-        vst1q_f32(&dst[0].vertices.x, r0);
-        vst1_f32(&dst[0].texCoords.u, uv0);
-        vst1q_f32(&dst[1].vertices.x, r1);
-        vst1_f32(&dst[1].texCoords.u, uv1);
-        vst1q_f32(&dst[2].vertices.x, r2);
-        vst1_f32(&dst[2].texCoords.u, uv2);
-        vst1q_f32(&dst[3].vertices.x, r3);
-        vst1_f32(&dst[3].texCoords.u, uv3);
-
-        dst += 4;
-        src += 4;
-    }
-
-    // Process remaining vertices one by one
-    while (dst < end)
-    {
-        float32x4_t v = vld1q_f32(&src->vertices.x);
-        float32x4_t r = vmlaq_laneq_f32(m.val[3], m.val[0], v, 0);
-        r = vmlaq_laneq_f32(r, m.val[1], v, 1);
-        r = vmlaq_laneq_f32(r, m.val[2], v, 2);
-        r = vsetq_lane_f32(vgetq_lane_f32(v, 3), r, 3);
-        float32x2_t uv = vld1_f32(&src->texCoords.u);
-        vst1q_f32(&dst->vertices.x, r);
-        vst1_f32(&dst->texCoords.u, uv);
-
-        ++dst;
-        ++src;
-    }
-}
-
-inline void MathUtilNeon64::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
-{
-    auto end = dst + count;
-    auto off = vdupq_n_u16(offset);
-
-    if (count < 8)
-        goto LEFTOVER;
-
-    // Process 32 indices at a time if there's enough data
-    while (count >= 32)
-    {
-        // Load 32 indices
-        uint16x8x4_t v = vld1q_u16_x4(src);
-
-        // Add offset
-        v.val[0] = vaddq_u16(v.val[0], off);
-        v.val[1] = vaddq_u16(v.val[1], off);
-        v.val[2] = vaddq_u16(v.val[2], off);
-        v.val[3] = vaddq_u16(v.val[3], off);
-
-        // Store result
-        vst1q_u16_x4(dst, v);
-
-        dst += 32;
-        src += 32;
-        count -= 32;
-    }
-
-    // Process 8 indices at a time if there's enough data
-    while (count >= 8)
-    {
-        uint16x8_t v = vld1q_u16(src);
-        v = vaddq_u16(v, off);
-        vst1q_u16(dst, v);
-
-        dst += 8;
-        src += 8;
-        count -= 8;
-    }
-
-LEFTOVER:
-    // Process remaining indices one by one
-    while (count > 0)
-    {
-        *dst = *src + offset;
-        ++dst;
-        ++src;
-        --count;
-    }
-}
-
-NS_AX_MATH_END
diff --git a/core/math/MathUtilSSE.inl b/core/math/MathUtilSSE.inl
index 48a377bdcbff..4869fe98b1de 100644
--- a/core/math/MathUtilSSE.inl
+++ b/core/math/MathUtilSSE.inl
@@ -1,157 +1,276 @@
+/****************************************************************************
+Copyright (c) 2010-2012 cocos2d-x.org
+Copyright (c) 2013-2017 Chukong Technologies
+Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
+Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
+
+https://axmol.dev/
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+****************************************************************************/
+
 NS_AX_MATH_BEGIN
 
-#ifdef AX_USE_SSE
+#ifdef AX_SSE_INTRINSICS
 
-void MathUtil::addMatrix(const __m128 m[4], float scalar, __m128 dst[4])
-{
-    __m128 s = _mm_set1_ps(scalar);
-    dst[0] = _mm_add_ps(m[0], s);
-    dst[1] = _mm_add_ps(m[1], s);
-    dst[2] = _mm_add_ps(m[2], s);
-    dst[3] = _mm_add_ps(m[3], s);
-}
-
-void MathUtil::addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
+struct MathUtilSSE
 {
-    dst[0] = _mm_add_ps(m1[0], m2[0]);
-    dst[1] = _mm_add_ps(m1[1], m2[1]);
-    dst[2] = _mm_add_ps(m1[2], m2[2]);
-    dst[3] = _mm_add_ps(m1[3], m2[3]);
-}
 
-void MathUtil::subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
-{
-    dst[0] = _mm_sub_ps(m1[0], m2[0]);
-    dst[1] = _mm_sub_ps(m1[1], m2[1]);
-    dst[2] = _mm_sub_ps(m1[2], m2[2]);
-    dst[3] = _mm_sub_ps(m1[3], m2[3]);
-}
+    static void addMatrix(const __m128 m[4], float scalar, __m128 dst[4])
+    {
+        __m128 s = _mm_set1_ps(scalar);
+        dst[0]   = _mm_add_ps(m[0], s);
+        dst[1]   = _mm_add_ps(m[1], s);
+        dst[2]   = _mm_add_ps(m[2], s);
+        dst[3]   = _mm_add_ps(m[3], s);
+    }
 
-void MathUtil::multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4])
-{
-    __m128 s = _mm_set1_ps(scalar);
-    dst[0] = _mm_mul_ps(m[0], s);
-    dst[1] = _mm_mul_ps(m[1], s);
-    dst[2] = _mm_mul_ps(m[2], s);
-    dst[3] = _mm_mul_ps(m[3], s);
-}
-
-void MathUtil::multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
-{
-    __m128 dst0, dst1, dst2, dst3;
-	{
-		__m128 e0 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(0, 0, 0, 0));
-		__m128 e1 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(1, 1, 1, 1));
-		__m128 e2 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(2, 2, 2, 2));
-		__m128 e3 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(3, 3, 3, 3));
-        
-		__m128 v0 = _mm_mul_ps(m1[0], e0);
-		__m128 v1 = _mm_mul_ps(m1[1], e1);
-		__m128 v2 = _mm_mul_ps(m1[2], e2);
-		__m128 v3 = _mm_mul_ps(m1[3], e3);
-        
-		__m128 a0 = _mm_add_ps(v0, v1);
-		__m128 a1 = _mm_add_ps(v2, v3);
-		__m128 a2 = _mm_add_ps(a0, a1);
-        
-		dst0 = a2;
-	}
-    
-	{
-		__m128 e0 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(0, 0, 0, 0));
-		__m128 e1 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(1, 1, 1, 1));
-		__m128 e2 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(2, 2, 2, 2));
-		__m128 e3 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(3, 3, 3, 3));
-        
-		__m128 v0 = _mm_mul_ps(m1[0], e0);
-		__m128 v1 = _mm_mul_ps(m1[1], e1);
-		__m128 v2 = _mm_mul_ps(m1[2], e2);
-		__m128 v3 = _mm_mul_ps(m1[3], e3);
-        
-		__m128 a0 = _mm_add_ps(v0, v1);
-		__m128 a1 = _mm_add_ps(v2, v3);
-		__m128 a2 = _mm_add_ps(a0, a1);
-        
-		dst1 = a2;
-	}
-    
-	{
-		__m128 e0 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(0, 0, 0, 0));
-		__m128 e1 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(1, 1, 1, 1));
-		__m128 e2 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(2, 2, 2, 2));
-		__m128 e3 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(3, 3, 3, 3));
-        
-		__m128 v0 = _mm_mul_ps(m1[0], e0);
-		__m128 v1 = _mm_mul_ps(m1[1], e1);
-		__m128 v2 = _mm_mul_ps(m1[2], e2);
-		__m128 v3 = _mm_mul_ps(m1[3], e3);
-        
-		__m128 a0 = _mm_add_ps(v0, v1);
-		__m128 a1 = _mm_add_ps(v2, v3);
-		__m128 a2 = _mm_add_ps(a0, a1);
-        
-		dst2 = a2;
-	}
-    
-	{
-		__m128 e0 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(0, 0, 0, 0));
-		__m128 e1 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(1, 1, 1, 1));
-		__m128 e2 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(2, 2, 2, 2));
-		__m128 e3 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(3, 3, 3, 3));
-        
-		__m128 v0 = _mm_mul_ps(m1[0], e0);
-		__m128 v1 = _mm_mul_ps(m1[1], e1);
-		__m128 v2 = _mm_mul_ps(m1[2], e2);
-		__m128 v3 = _mm_mul_ps(m1[3], e3);
-        
-		__m128 a0 = _mm_add_ps(v0, v1);
-		__m128 a1 = _mm_add_ps(v2, v3);
-		__m128 a2 = _mm_add_ps(a0, a1);
-        
-		dst3 = a2;
-	}
-    dst[0] = dst0;
-    dst[1] = dst1;
-    dst[2] = dst2;
-    dst[3] = dst3;
-}
-
-void MathUtil::negateMatrix(const __m128 m[4], __m128 dst[4])
-{
-    __m128 z = _mm_setzero_ps();
-    dst[0] = _mm_sub_ps(z, m[0]);
-    dst[1] = _mm_sub_ps(z, m[1]);
-    dst[2] = _mm_sub_ps(z, m[2]);
-    dst[3] = _mm_sub_ps(z, m[3]);
-}
-
-void MathUtil::transposeMatrix(const __m128 m[4], __m128 dst[4])
-{
-    __m128 tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44);
-    __m128 tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE);
-    __m128 tmp1 = _mm_shuffle_ps(m[2], m[3], 0x44);
-    __m128 tmp3 = _mm_shuffle_ps(m[2], m[3], 0xEE);
-    
-    dst[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
-    dst[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
-    dst[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
-    dst[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
-}
-
-void MathUtil::transformVec4(const __m128 m[4], const __m128& v, __m128& dst)
-{
-    __m128 col1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
-    __m128 col2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
-    __m128 col3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
-    __m128 col4 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
-    
-    dst = _mm_add_ps(
-                     _mm_add_ps(_mm_mul_ps(m[0], col1), _mm_mul_ps(m[1], col2)),
-                     _mm_add_ps(_mm_mul_ps(m[2], col3), _mm_mul_ps(m[3], col4))
-                     );
-}
+    static void addMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
+    {
+        dst[0] = _mm_add_ps(m1[0], m2[0]);
+        dst[1] = _mm_add_ps(m1[1], m2[1]);
+        dst[2] = _mm_add_ps(m1[2], m2[2]);
+        dst[3] = _mm_add_ps(m1[3], m2[3]);
+    }
 
-#endif
+    static void subtractMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
+    {
+        dst[0] = _mm_sub_ps(m1[0], m2[0]);
+        dst[1] = _mm_sub_ps(m1[1], m2[1]);
+        dst[2] = _mm_sub_ps(m1[2], m2[2]);
+        dst[3] = _mm_sub_ps(m1[3], m2[3]);
+    }
+
+    static void multiplyMatrix(const __m128 m[4], float scalar, __m128 dst[4])
+    {
+        __m128 s = _mm_set1_ps(scalar);
+        dst[0]   = _mm_mul_ps(m[0], s);
+        dst[1]   = _mm_mul_ps(m[1], s);
+        dst[2]   = _mm_mul_ps(m[2], s);
+        dst[3]   = _mm_mul_ps(m[3], s);
+    }
+
+    static void multiplyMatrix(const __m128 m1[4], const __m128 m2[4], __m128 dst[4])
+    {
+        __m128 dst0, dst1, dst2, dst3;
+        {
+            __m128 e0 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(0, 0, 0, 0));
+            __m128 e1 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(1, 1, 1, 1));
+            __m128 e2 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(2, 2, 2, 2));
+            __m128 e3 = _mm_shuffle_ps(m2[0], m2[0], _MM_SHUFFLE(3, 3, 3, 3));
+
+            __m128 v0 = _mm_mul_ps(m1[0], e0);
+            __m128 v1 = _mm_mul_ps(m1[1], e1);
+            __m128 v2 = _mm_mul_ps(m1[2], e2);
+            __m128 v3 = _mm_mul_ps(m1[3], e3);
+
+            __m128 a0 = _mm_add_ps(v0, v1);
+            __m128 a1 = _mm_add_ps(v2, v3);
+            __m128 a2 = _mm_add_ps(a0, a1);
+
+            dst0 = a2;
+        }
+
+        {
+            __m128 e0 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(0, 0, 0, 0));
+            __m128 e1 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(1, 1, 1, 1));
+            __m128 e2 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(2, 2, 2, 2));
+            __m128 e3 = _mm_shuffle_ps(m2[1], m2[1], _MM_SHUFFLE(3, 3, 3, 3));
+
+            __m128 v0 = _mm_mul_ps(m1[0], e0);
+            __m128 v1 = _mm_mul_ps(m1[1], e1);
+            __m128 v2 = _mm_mul_ps(m1[2], e2);
+            __m128 v3 = _mm_mul_ps(m1[3], e3);
+
+            __m128 a0 = _mm_add_ps(v0, v1);
+            __m128 a1 = _mm_add_ps(v2, v3);
+            __m128 a2 = _mm_add_ps(a0, a1);
+
+            dst1 = a2;
+        }
+
+        {
+            __m128 e0 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(0, 0, 0, 0));
+            __m128 e1 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(1, 1, 1, 1));
+            __m128 e2 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(2, 2, 2, 2));
+            __m128 e3 = _mm_shuffle_ps(m2[2], m2[2], _MM_SHUFFLE(3, 3, 3, 3));
+
+            __m128 v0 = _mm_mul_ps(m1[0], e0);
+            __m128 v1 = _mm_mul_ps(m1[1], e1);
+            __m128 v2 = _mm_mul_ps(m1[2], e2);
+            __m128 v3 = _mm_mul_ps(m1[3], e3);
+
+            __m128 a0 = _mm_add_ps(v0, v1);
+            __m128 a1 = _mm_add_ps(v2, v3);
+            __m128 a2 = _mm_add_ps(a0, a1);
+
+            dst2 = a2;
+        }
 
+        {
+            __m128 e0 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(0, 0, 0, 0));
+            __m128 e1 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(1, 1, 1, 1));
+            __m128 e2 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(2, 2, 2, 2));
+            __m128 e3 = _mm_shuffle_ps(m2[3], m2[3], _MM_SHUFFLE(3, 3, 3, 3));
+
+            __m128 v0 = _mm_mul_ps(m1[0], e0);
+            __m128 v1 = _mm_mul_ps(m1[1], e1);
+            __m128 v2 = _mm_mul_ps(m1[2], e2);
+            __m128 v3 = _mm_mul_ps(m1[3], e3);
+
+            __m128 a0 = _mm_add_ps(v0, v1);
+            __m128 a1 = _mm_add_ps(v2, v3);
+            __m128 a2 = _mm_add_ps(a0, a1);
+
+            dst3 = a2;
+        }
+        dst[0] = dst0;
+        dst[1] = dst1;
+        dst[2] = dst2;
+        dst[3] = dst3;
+    }
+
+    static void negateMatrix(const __m128 m[4], __m128 dst[4])
+    {
+        __m128 z = _mm_setzero_ps();
+        dst[0]   = _mm_sub_ps(z, m[0]);
+        dst[1]   = _mm_sub_ps(z, m[1]);
+        dst[2]   = _mm_sub_ps(z, m[2]);
+        dst[3]   = _mm_sub_ps(z, m[3]);
+    }
+
+    static void transposeMatrix(const __m128 m[4], __m128 dst[4])
+    {
+        __m128 tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44);
+        __m128 tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE);
+        __m128 tmp1 = _mm_shuffle_ps(m[2], m[3], 0x44);
+        __m128 tmp3 = _mm_shuffle_ps(m[2], m[3], 0xEE);
+
+        dst[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
+        dst[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
+        dst[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
+        dst[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
+    }
+
+    static void transformVec4(const __m128 m[4], float x, float y, float z, float w, float* dst /*vec3*/)
+    {
+        //__m128 res = _mm_set_ps(w, z, y, x);
+        //__m128 xx = _mm_shuffle_ps(res, res, _MM_SHUFFLE(0, 0, 0, 0));
+        //__m128 yy = _mm_shuffle_ps(res, res, _MM_SHUFFLE(1, 1, 1, 1));
+        //__m128 zz = _mm_shuffle_ps(res, res, _MM_SHUFFLE(2, 2, 2, 2));
+        //__m128 ww = _mm_shuffle_ps(res, res, _MM_SHUFFLE(3, 3, 3, 3));
+
+        __m128 xx = _mm_set1_ps(x);
+        __m128 yy = _mm_set1_ps(y);
+        __m128 zz = _mm_set1_ps(z);
+        __m128 ww = _mm_set1_ps(w);
+
+        auto res = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m[0], xx), _mm_mul_ps(m[1], yy)),
+                              _mm_add_ps(_mm_mul_ps(m[2], zz), _mm_mul_ps(m[3], ww)));
+
+        _mm_storel_pi((__m64*)dst, res);
+
+#    if defined(__SSE4_1__)
+        *reinterpret_cast<int*>(dst + 2) = _mm_extract_ps(res, 2);
+#    else
+        dst[2] = _mm_cvtss_f32(_mm_movehl_ps(res, res));
+#    endif
+    }
+
+    static void transformVec4(const __m128 m[4], const float* v /*vec4*/, float* dst /*vec4*/)
+    {
+        //__m128 res = _mm_loadu_ps(v);
+        //__m128 xx = _mm_shuffle_ps(res, res, _MM_SHUFFLE(0, 0, 0, 0));
+        //__m128 yy = _mm_shuffle_ps(res, res, _MM_SHUFFLE(1, 1, 1, 1));
+        //__m128 zz = _mm_shuffle_ps(res, res, _MM_SHUFFLE(2, 2, 2, 2));
+        //__m128 ww = _mm_shuffle_ps(res, res, _MM_SHUFFLE(3, 3, 3, 3));
+
+        __m128 xx = _mm_set1_ps(v[0]);
+        __m128 yy = _mm_set1_ps(v[1]);
+        __m128 zz = _mm_set1_ps(v[2]);
+        __m128 ww = _mm_set1_ps(v[3]);
+
+        auto res = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m[0], xx), _mm_mul_ps(m[1], yy)),
+                              _mm_add_ps(_mm_mul_ps(m[2], zz), _mm_mul_ps(m[3], ww)));
+        _mm_storeu_ps(dst, res);
+    }
+
+    static void crossVec3(const float* v1, const float* v2, float* dst)
+    {
+        __m128 a = _mm_set_ps(0.0f, v1[2], v1[1], v1[0]);
+        __m128 b = _mm_set_ps(0.0f, v2[2], v2[1], v2[0]);
+
+        __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+        __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
+        __m128 res   = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b));
+
+        res = _mm_shuffle_ps(res, res, _MM_SHUFFLE(3, 0, 2, 1));
+
+        _mm_storel_pi((__m64*)dst, res);
+#    if defined(__SSE4_1__)
+        *reinterpret_cast<int*>(dst + 2) = _mm_extract_ps(res, 2);
+#    else
+        dst[2] = _mm_cvtss_f32(_mm_movehl_ps(res, res));
+#    endif
+    }
+
+    static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
+    {
+        auto& m = transform.col;
+
+        for (size_t i = 0; i < count; ++i)
+        {
+            auto& vert = src[i].vertices;
+            __m128 v   = _mm_set_ps(1.0f, vert.z, vert.y, vert.x);
+            v          = _mm_add_ps(
+                _mm_add_ps(_mm_mul_ps(m[0], _mm_shuffle_ps(v, v, 0)), _mm_mul_ps(m[1], _mm_shuffle_ps(v, v, 0x55))),
+                _mm_add_ps(_mm_mul_ps(m[2], _mm_shuffle_ps(v, v, 0xaa)), _mm_mul_ps(m[3], _mm_shuffle_ps(v, v, 0xff))));
+            _mm_storeu_ps((float*)&dst[i].vertices, v);
+
+            // Copy colors and tex coords in one go; this also rewrites dst[i].colors, which the 4-float store above wrote over
+            // dst[i].texCoords = src[i].texCoords;
+            // dst[i].colors    = src[i].colors;
+            memcpy(&dst[i].colors, &src[i].colors, sizeof(V3F_C4B_T2F::colors) + sizeof(V3F_C4B_T2F::texCoords));
+        }
+    }
+
+    static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
+    {
+        __m128i offset_vector = _mm_set1_epi16(offset);
+        size_t remainder      = count % 8;
+        size_t rounded_count  = count - remainder;
+
+        for (size_t i = 0; i < rounded_count; i += 8)
+        {
+            __m128i current_values = _mm_loadu_si128((__m128i*)(src + i));          // Load 8 values.
+            current_values         = _mm_add_epi16(current_values, offset_vector);  // Add offset to them.
+            _mm_storeu_si128((__m128i*)(dst + i), current_values);                  // Store the result.
+        }
+
+        // If count is not divisible by 8, add offset for the remainder elements one by one.
+        for (size_t i = 0; i < remainder; ++i)
+        {
+            dst[rounded_count + i] = src[rounded_count + i] + offset;
+        }
+    }
+};
+
+#endif
 
 NS_AX_MATH_END
diff --git a/core/platform/PlatformConfig.h b/core/platform/PlatformConfig.h
index 0e0bd15a29de..39da4778f91c 100644
--- a/core/platform/PlatformConfig.h
+++ b/core/platform/PlatformConfig.h
@@ -163,5 +163,26 @@ Linux: Desktop GL/Vulkan
 #    endif
 #endif
 
+// ## SIMD detections
+#if !defined(AX_NEON_INTRINSICS)
+#    if (AX_TARGET_PLATFORM != AX_PLATFORM_WASM)
+#        if defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM) || defined(__ARM_NEON__)
+#            define AX_NEON_INTRINSICS 1
+#        endif
+#    endif
+#endif
+
+#ifdef AX_SSE_INTRINSICS
+// axmol math ISA requires at least SSE2
+#    include <emmintrin.h>
+#    if defined(__SSE4_1__)
+#        include <smmintrin.h>
+#    endif
+using _xm128_t = __m128;
+#elif defined(AX_NEON_INTRINSICS)
+#    include <arm_neon.h>
+using _xm128_t = float32x4_t;
+#endif
+
 /// @endcond
 #endif  // __BASE_AX_PLATFORM_CONFIG_H__
diff --git a/core/platform/PlatformMacros.h b/core/platform/PlatformMacros.h
index 401193803702..98f3bde1c915 100644
--- a/core/platform/PlatformMacros.h
+++ b/core/platform/PlatformMacros.h
@@ -89,12 +89,12 @@ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
  * @since v0.99.5
  */
 #if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
-#   if !defined(AX_ENABLE_CACHE_TEXTURE_DATA)
-#       define AX_ENABLE_CACHE_TEXTURE_DATA 1
-#   endif
+#    if !defined(AX_ENABLE_CACHE_TEXTURE_DATA)
+#        define AX_ENABLE_CACHE_TEXTURE_DATA 1
+#    endif
 #else
-#   undef AX_ENABLE_CACHE_TEXTURE_DATA
-#   define AX_ENABLE_CACHE_TEXTURE_DATA 0
+#    undef AX_ENABLE_CACHE_TEXTURE_DATA
+#    define AX_ENABLE_CACHE_TEXTURE_DATA 0
 #endif
 
 /** @def AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST
@@ -102,12 +102,12 @@ Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
  *
  */
 #if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID && !AX_ENABLE_CACHE_TEXTURE_DATA)
-#   if !defined(AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST)
-#       define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 1
-#   endif
+#    if !defined(AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST)
+#        define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 1
+#    endif
 #else
-#   undef AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST
-#   define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 0
+#    undef AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST
+#    define AX_ENABLE_RESTART_APPLICATION_ON_CONTEXT_LOST 0
 #endif
 
 #if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID) || (AX_TARGET_PLATFORM == AX_PLATFORM_WIN32)
@@ -188,14 +188,20 @@ protected:                                                \
     varType varName;                                      \
                                                           \
 public:                                                   \
-    virtual inline varType get##funName() const { return varName; }
+    virtual inline varType get##funName() const           \
+    {                                                     \
+        return varName;                                   \
+    }
 
 #define AX_SYNTHESIZE_READONLY_PASS_BY_REF(varType, varName, funName) \
 protected:                                                            \
     varType varName;                                                  \
                                                                       \
 public:                                                               \
-    virtual inline const varType& get##funName() const { return varName; }
+    virtual inline const varType& get##funName() const                \
+    {                                                                 \
+        return varName;                                               \
+    }
 
 /** @def AX_SYNTHESIZE
  * It is used to declare a protected variable.
@@ -209,36 +215,51 @@ public:                                                               \
  *            The variables and methods declared after AX_SYNTHESIZE are all public.
  *            If you need protected or private, please declare.
  */
-#define AX_SYNTHESIZE(varType, varName, funName)                    \
-protected:                                                          \
-    varType varName;                                                \
-                                                                    \
-public:                                                             \
-    virtual inline varType get##funName() const { return varName; } \
-    virtual inline void set##funName(varType var) { varName = var; }
-
-#define AX_SYNTHESIZE_PASS_BY_REF(varType, varName, funName)               \
-protected:                                                                 \
-    varType varName;                                                       \
-                                                                           \
-public:                                                                    \
-    virtual inline const varType& get##funName() const { return varName; } \
-    virtual inline void set##funName(const varType& var) { varName = var; }
-
-#define AX_SYNTHESIZE_RETAIN(varType, varName, funName)             \
-private:                                                            \
-    varType varName;                                                \
-                                                                    \
-public:                                                             \
-    virtual inline varType get##funName() const { return varName; } \
-    virtual inline void set##funName(varType var)                   \
-    {                                                               \
-        if (varName != var)                                         \
-        {                                                           \
-            AX_SAFE_RETAIN(var);                                    \
-            AX_SAFE_RELEASE(varName);                               \
-            varName = var;                                          \
-        }                                                           \
+#define AX_SYNTHESIZE(varType, varName, funName)  \
+protected:                                        \
+    varType varName;                              \
+                                                  \
+public:                                           \
+    virtual inline varType get##funName() const   \
+    {                                             \
+        return varName;                           \
+    }                                             \
+    virtual inline void set##funName(varType var) \
+    {                                             \
+        varName = var;                            \
+    }
+
+#define AX_SYNTHESIZE_PASS_BY_REF(varType, varName, funName) \
+protected:                                                   \
+    varType varName;                                         \
+                                                             \
+public:                                                      \
+    virtual inline const varType& get##funName() const       \
+    {                                                        \
+        return varName;                                      \
+    }                                                        \
+    virtual inline void set##funName(const varType& var)     \
+    {                                                        \
+        varName = var;                                       \
+    }
+
+#define AX_SYNTHESIZE_RETAIN(varType, varName, funName) \
+private:                                                \
+    varType varName;                                    \
+                                                        \
+public:                                                 \
+    virtual inline varType get##funName() const         \
+    {                                                   \
+        return varName;                                 \
+    }                                                   \
+    virtual inline void set##funName(varType var)       \
+    {                                                   \
+        if (varName != var)                             \
+        {                                               \
+            AX_SAFE_RETAIN(var);                        \
+            AX_SAFE_RELEASE(varName);                   \
+            varName = var;                              \
+        }                                               \
     }
 
 #define AX_SAFE_DELETE(p) \
@@ -252,7 +273,7 @@ public:                                                             \
     {                           \
         if (p)                  \
         {                       \
-            delete[](p);        \
+            delete[] (p);       \
             (p) = nullptr;      \
         }                       \
     } while (0)
@@ -318,7 +339,7 @@ public:                                                             \
         } while (0)
 
 #elif _AX_DEBUG == 1
-#    define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__)
+#    define AXLOG(format, ...)      ax::print(format, ##__VA_ARGS__)
 #    define AXLOGERROR(format, ...) ax::print(format, ##__VA_ARGS__)
 #    define AXLOGINFO(format, ...) \
         do                         \
@@ -327,10 +348,10 @@ public:                                                             \
 #    define AXLOGWARN(...) __AXLOGWITHFUNCTION(__VA_ARGS__)
 
 #elif _AX_DEBUG > 1
-#    define AXLOG(format, ...) ax::print(format, ##__VA_ARGS__)
+#    define AXLOG(format, ...)      ax::print(format, ##__VA_ARGS__)
 #    define AXLOGERROR(format, ...) ax::print(format, ##__VA_ARGS__)
-#    define AXLOGINFO(format, ...) ax::print(format, ##__VA_ARGS__)
-#    define AXLOGWARN(...) __AXLOGWITHFUNCTION(__VA_ARGS__)
+#    define AXLOGINFO(format, ...)  ax::print(format, ##__VA_ARGS__)
+#    define AXLOGWARN(...)          __AXLOGWITHFUNCTION(__VA_ARGS__)
 #endif  // _AX_DEBUG
 
 /** Lua engine debug */
@@ -349,8 +370,8 @@ public:                                                             \
  */
 #if defined(__GNUC__) && ((__GNUC__ >= 5) || ((__GNUG__ == 4) && (__GNUC_MINOR__ >= 4))) || \
     (defined(__clang__) && (__clang_major__ >= 3)) || (_MSC_VER >= 1800)
-#    define AX_DISALLOW_COPY_AND_ASSIGN(TypeName) \
-        TypeName(const TypeName&) = delete;       \
+#    define AX_DISALLOW_COPY_AND_ASSIGN(TypeName)      \
+        TypeName(const TypeName&)            = delete; \
         TypeName& operator=(const TypeName&) = delete;
 #else
 #    define AX_DISALLOW_COPY_AND_ASSIGN(TypeName) \
@@ -444,15 +465,25 @@ public:                                                             \
  */
 #if __has_builtin(__builtin_expect)
 #    ifdef __cplusplus
-#        define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), true))
+#        define UTILS_LIKELY(exp)   (__builtin_expect(!!(exp), true))
 #        define UTILS_UNLIKELY(exp) (__builtin_expect(!!(exp), false))
 #    else
-#        define UTILS_LIKELY(exp) (__builtin_expect(!!(exp), 1))
+#        define UTILS_LIKELY(exp)   (__builtin_expect(!!(exp), 1))
 #        define UTILS_UNLIKELY(exp) (__builtin_expect(!!(exp), 0))
 #    endif
 #else
-#    define UTILS_LIKELY(exp) (!!(exp))
+#    define UTILS_LIKELY(exp)   (!!(exp))
 #    define UTILS_UNLIKELY(exp) (!!(exp))
 #endif
 
+#if defined(_MSC_VER)
+// MSVC does not support loop unrolling hints, so both macros expand to nothing there.
+#    define UTILS_UNROLL
+#    define UTILS_NOUNROLL
+#else
+// C++11 _Pragma lets a macro emit a pragma. NOTE(review): GCC spells this `#pragma GCC unroll N`; verify there.
+#    define UTILS_UNROLL   _Pragma("unroll")
+#    define UTILS_NOUNROLL _Pragma("nounroll")
+#endif
+
 #endif  // __AX_PLATFORM_MACROS_H__
diff --git a/tests/unit-tests/Source/core/math/MathUtilTests.cpp b/tests/unit-tests/Source/core/math/MathUtilTests.cpp
index 4c5e8523b41a..b6d52fa2555c 100644
--- a/tests/unit-tests/Source/core/math/MathUtilTests.cpp
+++ b/tests/unit-tests/Source/core/math/MathUtilTests.cpp
@@ -26,57 +26,33 @@
 #include <doctest.h>
 #include "base/Config.h"
 #include "base/Types.h"
+#include "math/MathBase.h"
 #include "TestUtils.h"
 
-#if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS)
-    #if defined(__arm64__)
-        #define USE_NEON64 1
-        #define INCLUDE_NEON64 1
-    #elif defined(__ARM_NEON__)
-        #define USE_NEON32 1
-        #define INCLUDE_NEON32 1
-    #endif
-#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX)
-    #if defined(__arm64__) || defined(__aarch64__)
-        #define USE_NEON64 1
-        #define INCLUDE_NEON64 1
-    #endif
-#elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
-    #if defined(__arm64__) || defined(__aarch64__)
-        #define USE_NEON64 1
-        #define INCLUDE_NEON64 1
-    #elif defined(__ARM_NEON__)
-        #define INCLUDE_NEON32 1
-    #endif
-#endif
+#define INCLUDE_SSE
+#define USE_SSE
 
-#if defined(USE_NEON32) || defined(USE_NEON64) // || defined(USE_SSE)
-    #define SKIP_SIMD_TEST doctest::skip(false)
+#if defined(AX_SSE_INTRINSICS) || defined(AX_NEON_INTRINSICS)
+#    define SKIP_SIMD_TEST doctest::skip(false)
 #else
-    #define SKIP_SIMD_TEST doctest::skip(true)
+#    define SKIP_SIMD_TEST doctest::skip(true)
 #endif
 
 USING_NS_AX;
 
-namespace UnitTest {
-
-#ifdef INCLUDE_NEON32
-    #include "math/MathUtilNeon.inl"
-#endif
-
-#ifdef INCLUDE_NEON64
-    #include "math/MathUtilNeon64.inl"
-#endif
+namespace UnitTest
+{
 
-#ifdef INCLUDE_SSE
-    // #include "math/MathUtilSSE.inl"
+#ifdef AX_NEON_INTRINSICS
+#    include "math/MathUtilNeon.inl"
+#elif defined(AX_SSE_INTRINSICS)
+#    include "math/MathUtilSSE.inl"
 #endif
 
 #include "math/MathUtil.inl"
 
 }  // namespace UnitTest
 
-
 static void __checkMathUtilResult(std::string_view description, const float* a1, const float* a2, int size)
 {
     // Check whether the result of the optimized instruction is the same as which is implemented in C
@@ -87,11 +63,10 @@ static void __checkMathUtilResult(std::string_view description, const float* a1,
     }
 }
 
-
-TEST_SUITE("math/MathUtil") {
+TEST_SUITE("math/MathUtil")
+{
     using namespace UnitTest::ax;
 
-
     static void checkVerticesAreEqual(const V3F_C4B_T2F* v1, const V3F_C4B_T2F* v2, size_t count)
     {
         for (size_t i = 0; i < count; ++i)
@@ -102,84 +77,94 @@ TEST_SUITE("math/MathUtil") {
         }
     }
 
-
-    TEST_CASE("transformVertices") {
+    TEST_CASE("transformVertices")
+    {
         auto count = 5;
         std::vector<V3F_C4B_T2F> src(count);
         std::vector<V3F_C4B_T2F> expected(count);
         std::vector<V3F_C4B_T2F> dst(count);
 
-        for (int i = 0; i < count; ++i) {
+        for (int i = 0; i < count; ++i)
+        {
             src[i].vertices.set(float(i), float(i + 1), float(i + 2));
             src[i].colors.set(uint8_t(i + 3), uint8_t(i + 4), uint8_t(i + 5), uint8_t(i + 6));
             src[i].texCoords.set(float(i + 7), float(i + 8));
 
-            expected[i] = src[i];
+            expected[i]            = src[i];
             expected[i].vertices.x = src[i].vertices.y * 4;
             expected[i].vertices.y = src[i].vertices.x * -5;
             expected[i].vertices.z = src[i].vertices.z * 6;
         }
 
-        Mat4 transform(
-            0, 4, 0, 0,
-            -5, 0, 0, 0,
-            0, 0, 6, 0,
-            1, 2, 3, 1
-        );
+        Mat4 transform(0, 4, 0, 0, -5, 0, 0, 0, 0, 0, 6, 0, 1, 2, 3, 1);
 
-        SUBCASE("MathUtilC") {
+        SUBCASE("MathUtilC")
+        {
             MathUtilC::transformVertices(dst.data(), src.data(), count, transform);
             checkVerticesAreEqual(expected.data(), dst.data(), count);
         }
 
-        #if INCLUDE_NEON32
-            SUBCASE("MathUtilNeon") {
-                MathUtilNeon::transformVertices(dst.data(), src.data(), count, transform);
-                checkVerticesAreEqual(expected.data(), dst.data(), count);
-            }
-        #endif
-
-        #if INCLUDE_NEON64
-            SUBCASE("MathUtilNeon64") {
-                MathUtilNeon64::transformVertices(dst.data(), src.data(), count, transform);
-                checkVerticesAreEqual(expected.data(), dst.data(), count);
-            }
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        SUBCASE("MathUtilNeon")
+        {
+            MathUtilNeon::transformVertices(dst.data(), src.data(), count, transform);
+            checkVerticesAreEqual(expected.data(), dst.data(), count);
+        }
+#elif defined(AX_SSE_INTRINSICS)
+        SUBCASE("MathUtilSSE")
+        {
+            MathUtilSSE::transformVertices(dst.data(), src.data(), count, transform);
+            checkVerticesAreEqual(expected.data(), dst.data(), count);
+        }
+#endif
     }
 
-    TEST_CASE("transformIndices") {
+    TEST_CASE("transformIndices")
+    {
         auto count = 43;
         std::vector<uint16_t> src(count);
         std::vector<uint16_t> expected(count);
 
-        for (int i = 0; i < count; ++i) {
-            src[i] = i;
+        for (int i = 0; i < count; ++i)
+        {
+            src[i]      = i;
             expected[i] = i + 5;
         }
 
         uint16_t offset = 5;
 
-        SUBCASE("MathUtilC") {
+        SUBCASE("MathUtilC")
+        {
             std::vector<uint16_t> dst(count);
             MathUtilC::transformIndices(dst.data(), src.data(), count, offset);
             for (int i = 0; i < count; ++i)
                 CHECK_EQ(expected[i], dst[i]);
         }
 
-        #if INCLUDE_NEON64
-            SUBCASE("MathUtilNeon64") {
-                std::vector<uint16_t> dst(count);
-                MathUtilNeon64::transformIndices(dst.data(), src.data(), count, offset);
-                for (int i = 0; i < count; ++i)
-                    CHECK_EQ(expected[i], dst[i]);
-            }
-        #endif
+#if defined(AX_NEON_INTRINSICS) && AX_64BITS
+        SUBCASE("MathUtilNeon")
+        {
+            std::vector<uint16_t> dst(count);
+            MathUtilNeon::transformIndices(dst.data(), src.data(), count, offset);
+            for (int i = 0; i < count; ++i)
+                CHECK_EQ(expected[i], dst[i]);
+        }
+#elif defined(AX_SSE_INTRINSICS)
+        SUBCASE("MathUtilSSE")
+        {
+            std::vector<uint16_t> dst(count);
+            MathUtilSSE::transformIndices(dst.data(), src.data(), count, offset);
+            for (int i = 0; i < count; ++i)
+                CHECK_EQ(expected[i], dst[i]);
+        }
+#endif
     }
 }
 
-
-TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
-    TEST_CASE("old_tests") {
+TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST)
+{
+    TEST_CASE("old_tests")
+    {
         // I know the next line looks ugly, but it's a way to test MathUtil. :)
         using namespace UnitTest::ax;
 
@@ -213,20 +198,18 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void addMatrix(const float* m, float scalar, float* dst);
         MathUtilC::addMatrix(inMat41, scalar, outMat4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::addMatrix(inMat41, scalar, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::addMatrix(inMat41, scalar, outMat4Opt);
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(inMat41), scalar,
+                                reinterpret_cast<_xm128_t*>(outMat4Opt));
+#endif
 
-        #ifdef INCLUDE_SSE
-        // FIXME:
-        #endif
+#ifdef AX_SSE_INTRINSICS
+        MathUtilSSE::addMatrix(reinterpret_cast<const _xm128_t*>(inMat41), scalar,
+                               reinterpret_cast<_xm128_t*>(outMat4Opt));
+#endif
 
         __checkMathUtilResult("inline static void addMatrix(const float* m, float scalar, float* dst);", outMat4C,
-                            outMat4Opt, MAT4_SIZE);
+                              outMat4Opt, MAT4_SIZE);
         // Clean
         memset(outMat4C, 0, sizeof(outMat4C));
         memset(outMat4Opt, 0, sizeof(outMat4Opt));
@@ -234,20 +217,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void addMatrix(const float* m1, const float* m2, float* dst);
         MathUtilC::addMatrix(inMat41, inMat42, outMat4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::addMatrix(inMat41, inMat42, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::addMatrix(inMat41, inMat42, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_SSE
-            // FIXME:
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::addMatrix(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<const _xm128_t*>(inMat42),
+                                reinterpret_cast<_xm128_t*>(outMat4Opt));
+#elif defined(AX_SSE_INTRINSICS)
+        MathUtilSSE::addMatrix(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<const _xm128_t*>(inMat42),
+                               reinterpret_cast<_xm128_t*>(outMat4Opt));
+#endif
 
         __checkMathUtilResult("inline static void addMatrix(const float* m1, const float* m2, float* dst);", outMat4C,
-                            outMat4Opt, MAT4_SIZE);
+                              outMat4Opt, MAT4_SIZE);
         // Clean
         memset(outMat4C, 0, sizeof(outMat4C));
         memset(outMat4Opt, 0, sizeof(outMat4Opt));
@@ -255,20 +234,18 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
         MathUtilC::subtractMatrix(inMat41, inMat42, outMat4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::subtractMatrix(inMat41, inMat42, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::subtractMatrix(inMat41, inMat42, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_SSE
-            // FIXME:
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::subtractMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
+                                     reinterpret_cast<const _xm128_t*>(inMat42),
+                                     reinterpret_cast<_xm128_t*>(outMat4Opt));
+#elif defined(AX_SSE_INTRINSICS)
+        MathUtilSSE::subtractMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
+                                    reinterpret_cast<const _xm128_t*>(inMat42),
+                                    reinterpret_cast<_xm128_t*>(outMat4Opt));
+#endif
 
-        __checkMathUtilResult("inline static void subtractMatrix(const float* m1, const float* m2, float* dst);", outMat4C,
-                            outMat4Opt, MAT4_SIZE);
+        __checkMathUtilResult("inline static void subtractMatrix(const float* m1, const float* m2, float* dst);",
+                              outMat4C, outMat4Opt, MAT4_SIZE);
         // Clean
         memset(outMat4C, 0, sizeof(outMat4C));
         memset(outMat4Opt, 0, sizeof(outMat4Opt));
@@ -276,20 +253,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void multiplyMatrix(const float* m, float scalar, float* dst);
         MathUtilC::multiplyMatrix(inMat41, scalar, outMat4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::multiplyMatrix(inMat41, scalar, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::multiplyMatrix(inMat41, scalar, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_SSE
-            // FIXME:
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(inMat41), scalar,
+                                     reinterpret_cast<_xm128_t*>(outMat4Opt));
+#elif defined(AX_SSE_INTRINSICS)
+        MathUtilSSE::multiplyMatrix(reinterpret_cast<const _xm128_t*>(inMat41), scalar,
+                                    reinterpret_cast<_xm128_t*>(outMat4Opt));
+#endif
 
         __checkMathUtilResult("inline static void multiplyMatrix(const float* m, float scalar, float* dst);", outMat4C,
-                            outMat4Opt, MAT4_SIZE);
+                              outMat4Opt, MAT4_SIZE);
         // Clean
         memset(outMat4C, 0, sizeof(outMat4C));
         memset(outMat4Opt, 0, sizeof(outMat4Opt));
@@ -297,20 +270,18 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
         MathUtilC::multiplyMatrix(inMat41, inMat42, outMat4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::multiplyMatrix(inMat41, inMat42, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::multiplyMatrix(inMat41, inMat42, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_SSE
-            // FIXME:
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::multiplyMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
+                                     reinterpret_cast<const _xm128_t*>(inMat42),
+                                     reinterpret_cast<_xm128_t*>(outMat4Opt));
+#elif defined(AX_SSE_INTRINSICS)
+        MathUtilSSE::multiplyMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
+                                    reinterpret_cast<const _xm128_t*>(inMat42),
+                                    reinterpret_cast<_xm128_t*>(outMat4Opt));
+#endif
 
-        __checkMathUtilResult("inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);", outMat4C,
-                            outMat4Opt, MAT4_SIZE);
+        __checkMathUtilResult("inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);",
+                              outMat4C, outMat4Opt, MAT4_SIZE);
         // Clean
         memset(outMat4C, 0, sizeof(outMat4C));
         memset(outMat4Opt, 0, sizeof(outMat4Opt));
@@ -318,20 +289,14 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void negateMatrix(const float* m, float* dst);
         MathUtilC::negateMatrix(inMat41, outMat4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::negateMatrix(inMat41, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::negateMatrix(inMat41, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_SSE
-            // FIXME:
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::negateMatrix(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<_xm128_t*>(outMat4Opt));
+#elif defined(AX_SSE_INTRINSICS)
+        MathUtilSSE::negateMatrix(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<_xm128_t*>(outMat4Opt));
+#endif
 
         __checkMathUtilResult("inline static void negateMatrix(const float* m, float* dst);", outMat4C, outMat4Opt,
-                            MAT4_SIZE);
+                              MAT4_SIZE);
         // Clean
         memset(outMat4C, 0, sizeof(outMat4C));
         memset(outMat4Opt, 0, sizeof(outMat4Opt));
@@ -339,20 +304,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void transposeMatrix(const float* m, float* dst);
         MathUtilC::transposeMatrix(inMat41, outMat4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::transposeMatrix(inMat41, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::transposeMatrix(inMat41, outMat4Opt);
-        #endif
-
-        #ifdef INCLUDE_SSE
-            // FIXME:
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::transposeMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
+                                      reinterpret_cast<_xm128_t*>(outMat4Opt));
+#elif defined(AX_SSE_INTRINSICS)
+        MathUtilSSE::transposeMatrix(reinterpret_cast<const _xm128_t*>(inMat41),
+                                     reinterpret_cast<_xm128_t*>(outMat4Opt));
+#endif
 
         __checkMathUtilResult("inline static void transposeMatrix(const float* m, float* dst);", outMat4C, outMat4Opt,
-                            MAT4_SIZE);
+                              MAT4_SIZE);
         // Clean
         memset(outMat4C, 0, sizeof(outMat4C));
         memset(outMat4Opt, 0, sizeof(outMat4Opt));
@@ -360,21 +321,16 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
         MathUtilC::transformVec4(inMat41, x, y, z, w, outVec4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::transformVec4(inMat41, x, y, z, w, outVec4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::transformVec4(inMat41, x, y, z, w, outVec4Opt);
-        #endif
-
-        #ifdef INCLUDE_SSE
-            // FIXME:
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(inMat41), x, y, z, w, outVec4Opt);
+#elif defined(AX_SSE_INTRINSICS)
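+        // FIXME(review): marker left over from the removed INCLUDE_SSE stub; the SSE call now exists - confirm stale.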
+        MathUtilSSE::transformVec4(reinterpret_cast<const _xm128_t*>(inMat41), x, y, z, w, outVec4Opt);
+#endif
 
         __checkMathUtilResult(
-            "inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);", outVec4C,
-            outVec4Opt, VEC4_SIZE);
+            "inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);",
+            outVec4C, outVec4Opt, VEC4_SIZE);
         // Clean
         memset(outVec4C, 0, sizeof(outVec4C));
         memset(outVec4Opt, 0, sizeof(outVec4Opt));
@@ -382,20 +338,15 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void transformVec4(const float* m, const float* v, float* dst);
         MathUtilC::transformVec4(inMat41, inVec4, outVec4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::transformVec4(inMat41, inVec4, outVec4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::transformVec4(inMat41, inVec4, outVec4Opt);
-        #endif
-
-        #ifdef INCLUDE_SSE
-            // FIXME:
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::transformVec4(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<const float*>(inVec4),
+                                    reinterpret_cast<float*>(outVec4Opt));
+#elif defined(AX_SSE_INTRINSICS)
+        MathUtilSSE::transformVec4(reinterpret_cast<const _xm128_t*>(inMat41), reinterpret_cast<const float*>(inVec4), reinterpret_cast<float*>(outVec4Opt));
+#endif
 
         __checkMathUtilResult("inline static void transformVec4(const float* m, const float* v, float* dst);", outVec4C,
-                            outVec4Opt, VEC4_SIZE);
+                              outVec4Opt, VEC4_SIZE);
         // Clean
         memset(outVec4C, 0, sizeof(outVec4C));
         memset(outVec4Opt, 0, sizeof(outVec4Opt));
@@ -403,20 +354,14 @@ TEST_SUITE("math/MathUtil" * SKIP_SIMD_TEST) {
         // inline static void crossVec3(const float* v1, const float* v2, float* dst);
         MathUtilC::crossVec3(inVec4, inVec42, outVec4C);
 
-        #ifdef INCLUDE_NEON32
-            MathUtilNeon::crossVec3(inVec4, inVec42, outVec4Opt);
-        #endif
-
-        #ifdef INCLUDE_NEON64
-            MathUtilNeon64::crossVec3(inVec4, inVec42, outVec4Opt);
-        #endif
-
-        #ifdef INCLUDE_SSE
-            // FIXME:
-        #endif
+#ifdef AX_NEON_INTRINSICS
+        MathUtilNeon::crossVec3(inVec4, inVec42, outVec4Opt);
+#elif defined(AX_SSE_INTRINSICS)
+        MathUtilSSE::crossVec3(inVec4, inVec42, outVec4Opt);
+#endif
 
         __checkMathUtilResult("inline static void crossVec3(const float* v1, const float* v2, float* dst);", outVec4C,
-                            outVec4Opt, VEC4_SIZE);
+                              outVec4Opt, VEC4_SIZE);
         // Clean
         memset(outVec4C, 0, sizeof(outVec4C));
         memset(outVec4Opt, 0, sizeof(outVec4Opt));