From 9943db28b00ec08351cd891849e8d9368e42cf42 Mon Sep 17 00:00:00 2001 From: wjr <1966336874@qq.com> Date: Sat, 17 Aug 2024 19:50:56 +0800 Subject: [PATCH] update --- .gitmodules | 3 + atomic | 1 + .../wjr/arch/generic/math/bignum-config.hpp | 34 +- .../arch/x86/container/generic/bplus_tree.hpp | 32 +- include/wjr/arch/x86/format/charconv.hpp | 18 +- include/wjr/arch/x86/format/utf8/utf8.hpp | 6 +- include/wjr/arch/x86/json/lexer.hpp | 10 +- include/wjr/arch/x86/json/string.hpp | 10 +- include/wjr/arch/x86/math/add.hpp | 50 +-- include/wjr/arch/x86/math/compare.hpp | 10 +- include/wjr/arch/x86/math/div.hpp | 24 +- include/wjr/arch/x86/math/divider.hpp | 14 +- include/wjr/arch/x86/math/find.hpp | 150 +++---- include/wjr/arch/x86/math/gen_addrsblsh_n.hpp | 14 +- include/wjr/arch/x86/math/gen_addsub.hpp | 14 +- .../wjr/arch/x86/math/large-compare-impl.hpp | 185 ++++----- include/wjr/arch/x86/math/large-find-impl.hpp | 318 +++++++------- include/wjr/arch/x86/math/mul-impl.hpp | 76 ++-- include/wjr/arch/x86/math/mul.hpp | 56 +-- include/wjr/arch/x86/math/not.hpp | 12 +- include/wjr/arch/x86/math/prefix_xor.hpp | 10 +- include/wjr/arch/x86/math/set.hpp | 14 +- include/wjr/arch/x86/math/shift.hpp | 62 +-- include/wjr/arch/x86/math/sub.hpp | 50 +-- include/wjr/arch/x86/simd/avx.hpp | 24 +- include/wjr/arch/x86/simd/intrin.hpp | 14 +- include/wjr/arch/x86/simd/simd.hpp | 10 +- include/wjr/arch/x86/simd/simd_cast.hpp | 8 +- include/wjr/arch/x86/simd/sse.hpp | 226 +++++----- include/wjr/assert.hpp | 12 +- include/wjr/atomic.hpp | 4 + include/wjr/capture_leaf.hpp | 3 +- include/wjr/concurrency/pause.hpp | 48 +++ .../wjr/{network => concurrency}/timer.hpp | 10 +- include/wjr/container/generic/bplus_tree.hpp | 4 +- include/wjr/crtp/nonsendable.hpp | 4 +- include/wjr/format/charconv.hpp | 2 +- include/wjr/format/dragonbox.hpp | 3 +- include/wjr/format/fastfloat.hpp | 20 +- include/wjr/format/utf8/utf8.hpp | 2 +- include/wjr/iterator/detail.hpp | 2 +- include/wjr/json/lexer.hpp | 
2 +- include/wjr/json/string.hpp | 2 +- include/wjr/math/add.hpp | 25 +- include/wjr/math/bit.hpp | 12 +- include/wjr/math/clz.hpp | 12 +- include/wjr/math/compare.hpp | 6 +- include/wjr/math/ctz.hpp | 12 +- include/wjr/math/detail.hpp | 2 +- include/wjr/math/div.hpp | 2 +- include/wjr/math/divider.hpp | 2 +- include/wjr/math/find.hpp | 2 +- include/wjr/math/mul.hpp | 16 +- include/wjr/math/not.hpp | 2 +- include/wjr/math/popcount.hpp | 22 +- include/wjr/math/prefix_xor.hpp | 2 +- include/wjr/math/set.hpp | 2 +- include/wjr/math/shift.hpp | 2 +- include/wjr/math/sub.hpp | 24 +- include/wjr/memory/detail.hpp | 2 +- include/wjr/memory/safe_pointer.hpp | 2 +- include/wjr/network/pause.hpp | 50 --- include/wjr/preprocessor/arithmatic/dec.hpp | 1 - include/wjr/preprocessor/compiler.hpp | 6 - include/wjr/preprocessor/compiler/arch.hpp | 42 -- .../wjr/preprocessor/compiler/attribute.hpp | 300 -------------- .../wjr/preprocessor/compiler/compiler.hpp | 113 ----- include/wjr/preprocessor/config.hpp | 7 + include/wjr/preprocessor/config/arch.hpp | 42 ++ include/wjr/preprocessor/config/attribute.hpp | 308 ++++++++++++++ include/wjr/preprocessor/config/compiler.hpp | 113 +++++ .../preprocessor/{compiler => config}/has.hpp | 123 +++--- include/wjr/preprocessor/config/platform.hpp | 13 + include/wjr/preprocessor/logical/bool.hpp | 1 - include/wjr/preprocessor/preview.hpp | 30 +- include/wjr/simd/simd.hpp | 2 +- include/wjr/string.hpp | 137 +++--- include/wjr/tp/compiler.hpp | 5 +- include/wjr/type_traits.hpp | 391 ++++++++++++++++++ src/wjr/format/dragonbox.cpp | 2 +- src/wjr/json/json.cpp | 21 +- src/wjr/json/lexer.cpp | 2 + src/wjr/memory/memory_pool.cpp | 5 +- src/wjr/x86/json/lexer.cpp | 48 ++- src/wjr/x86/json/string.cpp | 12 +- src/wjr/x86/math/mul.cpp | 8 +- src/wjr/x86/math/simd.cpp | 2 +- third-party/atomic | 1 + 88 files changed, 1970 insertions(+), 1537 deletions(-) create mode 160000 atomic create mode 100644 include/wjr/atomic.hpp create mode 100644 
include/wjr/concurrency/pause.hpp rename include/wjr/{network => concurrency}/timer.hpp (97%) delete mode 100644 include/wjr/network/pause.hpp delete mode 100644 include/wjr/preprocessor/compiler.hpp delete mode 100644 include/wjr/preprocessor/compiler/arch.hpp delete mode 100644 include/wjr/preprocessor/compiler/attribute.hpp delete mode 100644 include/wjr/preprocessor/compiler/compiler.hpp create mode 100644 include/wjr/preprocessor/config.hpp create mode 100644 include/wjr/preprocessor/config/arch.hpp create mode 100644 include/wjr/preprocessor/config/attribute.hpp create mode 100644 include/wjr/preprocessor/config/compiler.hpp rename include/wjr/preprocessor/{compiler => config}/has.hpp (57%) create mode 100644 include/wjr/preprocessor/config/platform.hpp create mode 160000 third-party/atomic diff --git a/.gitmodules b/.gitmodules index e69de29b..6236a225 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third-party/atomic"] + path = third-party/atomic + url = https://github.com/boostorg/atomic.git diff --git a/atomic b/atomic new file mode 160000 index 00000000..fbdb5f44 --- /dev/null +++ b/atomic @@ -0,0 +1 @@ +Subproject commit fbdb5f44d4ace030d8a8836cc4f3738bcc1c6d72 diff --git a/include/wjr/arch/generic/math/bignum-config.hpp b/include/wjr/arch/generic/math/bignum-config.hpp index 4670ea19..e1d6aab0 100644 --- a/include/wjr/arch/generic/math/bignum-config.hpp +++ b/include/wjr/arch/generic/math/bignum-config.hpp @@ -2,71 +2,71 @@ #define WJR_GENERIC_MATH_BIGNUM_CONFIG_HPP__ #ifndef WJR_TOOM22_MUL_THRESHOLD -#define WJR_TOOM22_MUL_THRESHOLD 22 + #define WJR_TOOM22_MUL_THRESHOLD 22 #endif #ifndef WJR_TOOM33_MUL_THRESHOLD -#define WJR_TOOM33_MUL_THRESHOLD 84 + #define WJR_TOOM33_MUL_THRESHOLD 84 #endif #ifndef WJR_TOOM44_MUL_THRESHOLD -#define WJR_TOOM44_MUL_THRESHOLD 208 + #define WJR_TOOM44_MUL_THRESHOLD 208 #endif #ifndef WJR_TOOM55_MUL_THRESHOLD -#define WJR_TOOM55_MUL_THRESHOLD 800 + #define WJR_TOOM55_MUL_THRESHOLD 800 #endif 
#ifndef WJR_TOOM32_TO_TOOM43_MUL_THRESHOLD -#define WJR_TOOM32_TO_TOOM43_MUL_THRESHOLD 73 + #define WJR_TOOM32_TO_TOOM43_MUL_THRESHOLD 73 #endif #ifndef WJR_TOOM32_TO_TOOM53_MUL_THRESHOLD -#define WJR_TOOM32_TO_TOOM53_MUL_THRESHOLD 153 + #define WJR_TOOM32_TO_TOOM53_MUL_THRESHOLD 153 #endif #ifndef WJR_TOOM42_TO_TOOM53_MUL_THRESHOLD -#define WJR_TOOM42_TO_TOOM53_MUL_THRESHOLD 137 + #define WJR_TOOM42_TO_TOOM53_MUL_THRESHOLD 137 #endif #ifndef WJR_TOOM42_TO_TOOM63_MUL_THRESHOLD -#define WJR_TOOM42_TO_TOOM63_MUL_THRESHOLD 153 + #define WJR_TOOM42_TO_TOOM63_MUL_THRESHOLD 153 #endif #ifndef WJR_TOOM2_SQR_THRESHOLD -#define WJR_TOOM2_SQR_THRESHOLD 34 + #define WJR_TOOM2_SQR_THRESHOLD 34 #endif #ifndef WJR_TOOM3_SQR_THRESHOLD -#define WJR_TOOM3_SQR_THRESHOLD 124 + #define WJR_TOOM3_SQR_THRESHOLD 124 #endif #ifndef WJR_TOOM4_SQR_THRESHOLD -#define WJR_TOOM4_SQR_THRESHOLD 288 + #define WJR_TOOM4_SQR_THRESHOLD 288 #endif #ifndef WJR_TOOM5_SQR_THRESHOLD -#define WJR_TOOM5_SQR_THRESHOLD 980 + #define WJR_TOOM5_SQR_THRESHOLD 980 #endif #ifndef WJR_DC_DIV_QR_THRESHOLD -#define WJR_DC_DIV_QR_THRESHOLD (WJR_TOOM22_MUL_THRESHOLD * 2) + #define WJR_DC_DIV_QR_THRESHOLD (WJR_TOOM22_MUL_THRESHOLD * 2) #endif // WJR_DC_DIV_QR_THRESHOLD #ifndef WJR_DC_BIGNUM_TO_CHARS_THRESHOLD -#define WJR_DC_BIGNUM_TO_CHARS_THRESHOLD 20 + #define WJR_DC_BIGNUM_TO_CHARS_THRESHOLD 20 #endif #ifndef WJR_DC_BIGNUM_TO_CHARS_PRECOMPUTE_THRESHOLD -#define WJR_DC_BIGNUM_TO_CHARS_PRECOMPUTE_THRESHOLD 20 + #define WJR_DC_BIGNUM_TO_CHARS_PRECOMPUTE_THRESHOLD 20 #endif #ifndef WJR_DC_BIGNUM_FROM_CHARS_THRESHOLD -#define WJR_DC_BIGNUM_FROM_CHARS_THRESHOLD 1670 + #define WJR_DC_BIGNUM_FROM_CHARS_THRESHOLD 1670 #endif #ifndef WJR_DC_BIGNUM_FROM_CHARS_PRECOMPUTE_THRESHOLD -#define WJR_DC_BIGNUM_FROM_CHARS_PRECOMPUTE_THRESHOLD 3105 + #define WJR_DC_BIGNUM_FROM_CHARS_PRECOMPUTE_THRESHOLD 3105 #endif #endif // WJR_GENERIC_MATH_BIGNUM_CONFIG_HPP__ \ No newline at end of file diff --git 
a/include/wjr/arch/x86/container/generic/bplus_tree.hpp b/include/wjr/arch/x86/container/generic/bplus_tree.hpp index bbfd1064..705bd663 100644 --- a/include/wjr/arch/x86/container/generic/bplus_tree.hpp +++ b/include/wjr/arch/x86/container/generic/bplus_tree.hpp @@ -1,5 +1,5 @@ -#ifndef WJR_X86_CONTAINER_GENERIC_BPLUS_TREE_HPP__ -#define WJR_X86_CONTAINER_GENERIC_BPLUS_TREE_HPP__ +#ifndef WJR_ARCH_X86_CONTAINER_GENERIC_BPLUS_TREE_HPP__ +#define WJR_ARCH_X86_CONTAINER_GENERIC_BPLUS_TREE_HPP__ #include @@ -8,7 +8,7 @@ namespace wjr { #if WJR_HAS_SIMD(SSE2) -#define WJR_HAS_BUILTIN_BPLUS_TREE_COPY WJR_HAS_DEF + #define WJR_HAS_BUILTIN_BPLUS_TREE_COPY WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(BPLUS_TREE_COPY) @@ -98,12 +98,12 @@ WJR_INTRINSIC_INLINE void __builtin_bplus_tree_copy_impl(const uint8_t *first, } } -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) const auto x0 = avx::loadu(first); const auto x1 = avx::loadu(last - 32); avx::storeu(dest, x0); avx::storeu(dest + n - 32, x1); -#else + #else const auto x0 = sse::loadu(first); const auto x1 = sse::loadu(first + 16); const auto x2 = sse::loadu(last - 32); @@ -112,13 +112,13 @@ WJR_INTRINSIC_INLINE void __builtin_bplus_tree_copy_impl(const uint8_t *first, sse::storeu((dest + 16), x1); sse::storeu((dest + n - 32), x2); sse::storeu((dest + n - 16), x3); -#endif + #endif return; } while (false); } if constexpr (size == 8) { -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) const auto x0 = avx::loadu(first); const auto x1 = avx::loadu(first + 32); const auto x2 = avx::loadu(last - 64); @@ -127,7 +127,7 @@ WJR_INTRINSIC_INLINE void __builtin_bplus_tree_copy_impl(const uint8_t *first, avx::storeu((dest + 32), x1); avx::storeu((dest + n - 64), x2); avx::storeu((dest + n - 32), x3); -#else + #else const auto x0 = sse::loadu(first); const auto x1 = sse::loadu(first + 16); const auto x2 = sse::loadu(first + 32); @@ -144,7 +144,7 @@ WJR_INTRINSIC_INLINE void __builtin_bplus_tree_copy_impl(const uint8_t *first, sse::storeu((dest 
+ n - 48), x5); sse::storeu((dest + n - 32), x6); sse::storeu((dest + n - 16), x7); -#endif + #endif } } @@ -241,12 +241,12 @@ __builtin_bplus_tree_copy_backward_impl(const uint8_t *first, const uint8_t *las } } -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) const auto x0 = avx::loadu(first); const auto x1 = avx::loadu(last - 32); avx::storeu((dest - n), x0); avx::storeu((dest - 32), x1); -#else + #else const auto x0 = sse::loadu(first); const auto x1 = sse::loadu(first + 16); const auto x2 = sse::loadu(last - 32); @@ -255,13 +255,13 @@ __builtin_bplus_tree_copy_backward_impl(const uint8_t *first, const uint8_t *las sse::storeu((dest - n + 16), x1); sse::storeu((dest - 32), x2); sse::storeu((dest - 16), x3); -#endif + #endif return; } while (false); } if constexpr (size == 8) { -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) const auto x0 = avx::loadu(first); const auto x1 = avx::loadu(first + 32); const auto x2 = avx::loadu(last - 64); @@ -270,7 +270,7 @@ __builtin_bplus_tree_copy_backward_impl(const uint8_t *first, const uint8_t *las avx::storeu((dest - n + 32), x1); avx::storeu((dest - 64), x2); avx::storeu((dest - 32), x3); -#else + #else const auto x0 = sse::loadu(first); const auto x1 = sse::loadu(first + 16); const auto x2 = sse::loadu(first + 32); @@ -287,7 +287,7 @@ __builtin_bplus_tree_copy_backward_impl(const uint8_t *first, const uint8_t *las sse::storeu((dest - 48), x5); sse::storeu((dest - 32), x6); sse::storeu((dest - 16), x7); -#endif + #endif } } @@ -304,4 +304,4 @@ WJR_INTRINSIC_INLINE void builtin_bplus_tree_copy_backward(const Other *first, } // namespace wjr -#endif // WJR_X86_CONTAINER_GENERIC_BPLUS_TREE_HPP__ +#endif // WJR_ARCH_X86_CONTAINER_GENERIC_BPLUS_TREE_HPP__ diff --git a/include/wjr/arch/x86/format/charconv.hpp b/include/wjr/arch/x86/format/charconv.hpp index c503babe..b2058f41 100644 --- a/include/wjr/arch/x86/format/charconv.hpp +++ b/include/wjr/arch/x86/format/charconv.hpp @@ -1,21 +1,21 @@ -#ifndef 
WJR_X86_FORMAT_CHARCONV_HPP__ -#define WJR_X86_FORMAT_CHARCONV_HPP__ +#ifndef WJR_ARCH_X86_FORMAT_CHARCONV_HPP__ +#define WJR_ARCH_X86_FORMAT_CHARCONV_HPP__ -#include #include +#include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif namespace wjr { #if WJR_HAS_SIMD(SSE4_1) -#define WJR_HAS_BUILTIN_TO_CHARS_UNROLL_8_FAST WJR_HAS_DEF + #define WJR_HAS_BUILTIN_TO_CHARS_UNROLL_8_FAST WJR_HAS_DEF -#define WJR_HAS_BUILTIN_FROM_CHARS_UNROLL_4_FAST WJR_HAS_DEF -#define WJR_HAS_BUILTIN_FROM_CHARS_UNROLL_8_FAST WJR_HAS_DEF -#define WJR_HAS_BUILTIN_FROM_CHARS_UNROLL_16_FAST WJR_HAS_DEF + #define WJR_HAS_BUILTIN_FROM_CHARS_UNROLL_4_FAST WJR_HAS_DEF + #define WJR_HAS_BUILTIN_FROM_CHARS_UNROLL_8_FAST WJR_HAS_DEF + #define WJR_HAS_BUILTIN_FROM_CHARS_UNROLL_16_FAST WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(TO_CHARS_UNROLL_8_FAST) @@ -194,4 +194,4 @@ uint64_t builtin_from_chars_unroll_16_fast(const void *ptr, origin_converter_t) } // namespace wjr -#endif // WJR_X86_FORMAT_CHARCONV_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_FORMAT_CHARCONV_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/format/utf8/utf8.hpp b/include/wjr/arch/x86/format/utf8/utf8.hpp index 31fd3bd7..86056f99 100644 --- a/include/wjr/arch/x86/format/utf8/utf8.hpp +++ b/include/wjr/arch/x86/format/utf8/utf8.hpp @@ -1,8 +1,8 @@ -#ifndef WJR_X86_FORMAT_UTF8_HPP__ -#define WJR_X86_FORMAT_UTF8_HPP__ +#ifndef WJR_ARCH_X86_FORMAT_UTF8_HPP__ +#define WJR_ARCH_X86_FORMAT_UTF8_HPP__ #include namespace wjr::utf8 {} // namespace wjr::utf8 -#endif // WJR_X86_FORMAT_UTF8_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_FORMAT_UTF8_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/json/lexer.hpp b/include/wjr/arch/x86/json/lexer.hpp index 459bffc1..da919544 100644 --- a/include/wjr/arch/x86/json/lexer.hpp +++ b/include/wjr/arch/x86/json/lexer.hpp @@ -1,11 +1,11 @@ -#ifndef WJR_X86_JSON_LEXER_HPP__ -#define WJR_X86_JSON_LEXER_HPP__ +#ifndef 
WJR_ARCH_X86_JSON_LEXER_HPP__ +#define WJR_ARCH_X86_JSON_LEXER_HPP__ #include #if WJR_HAS_SIMD(SSSE3) -#define WJR_HAS_BUILTIN_JSON_LEXER_READER_READ_BUF WJR_HAS_DEF -#define WJR_HAS_BUILTIN_JSON_MINIFY_BUF WJR_HAS_DEF + #define WJR_HAS_BUILTIN_JSON_LEXER_READER_READ_BUF WJR_HAS_DEF + #define WJR_HAS_BUILTIN_JSON_MINIFY_BUF WJR_HAS_DEF #endif -#endif // WJR_X86_JSON_LEXER_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_JSON_LEXER_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/json/string.hpp b/include/wjr/arch/x86/json/string.hpp index 0c734328..b60f3b53 100644 --- a/include/wjr/arch/x86/json/string.hpp +++ b/include/wjr/arch/x86/json/string.hpp @@ -1,11 +1,11 @@ -#ifndef WJR_X86_JSON_STRING_HPP__ -#define WJR_X86_JSON_STRING_HPP__ +#ifndef WJR_ARCH_X86_JSON_STRING_HPP__ +#define WJR_ARCH_X86_JSON_STRING_HPP__ #include #if WJR_HAS_SIMD(SSE2) -#define WJR_HAS_BUILTIN_JSON_PARSE_STRING WJR_HAS_DEF -#define WJR_HAS_BUILTIN_JSON_CHECK_STRING WJR_HAS_DEF + #define WJR_HAS_BUILTIN_JSON_PARSE_STRING WJR_HAS_DEF + #define WJR_HAS_BUILTIN_JSON_CHECK_STRING WJR_HAS_DEF #endif -#endif // WJR_X86_JSON_STRING_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_JSON_STRING_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/add.hpp b/include/wjr/arch/x86/math/add.hpp index aa99b692..c5a28284 100644 --- a/include/wjr/arch/x86/math/add.hpp +++ b/include/wjr/arch/x86/math/add.hpp @@ -1,36 +1,36 @@ -#ifndef WJR_X86_MATH_ADD_HPP__ -#define WJR_X86_MATH_ADD_HPP__ +#ifndef WJR_ARCH_X86_MATH_ADD_HPP__ +#define WJR_ARCH_X86_MATH_ADD_HPP__ #include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_ADDC WJR_HAS_DEF -#define WJR_HAS_BUILTIN_ASM_ADDC_N WJR_HAS_DEF -#define WJR_HAS_BUILTIN___ASM_ADD_128 WJR_HAS_DEF -#define WJR_HAS_BUILTIN___ASM_ADDC_128 WJR_HAS_DEF - -#if WJR_HAS_FEATURE(INLINE_ASM_CCCOND) -#define WJR_HAS_BUILTIN_ASM_ADDC_CC 
WJR_HAS_DEF -#define WJR_HAS_BUILTIN___ASM_ADDC_CC_128 WJR_HAS_DEF -#endif + #define WJR_HAS_BUILTIN_ASM_ADDC WJR_HAS_DEF + #define WJR_HAS_BUILTIN_ASM_ADDC_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN___ASM_ADD_128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN___ASM_ADDC_128 WJR_HAS_DEF + + #if WJR_HAS_FEATURE(INLINE_ASM_CCCOND) + #define WJR_HAS_BUILTIN_ASM_ADDC_CC WJR_HAS_DEF + #define WJR_HAS_BUILTIN___ASM_ADDC_CC_128 WJR_HAS_DEF + #endif #else -#if defined(WJR_MSVC) -#define WJR_HAS_BUILTIN_ASM_ADDC WJR_HAS_DEF_VAR(2) -#endif + #if defined(WJR_MSVC) + #define WJR_HAS_BUILTIN_ASM_ADDC WJR_HAS_DEF_VAR(2) + #endif -#if defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_ADDC_N WJR_HAS_DEF_VAR(3) -#endif + #if defined(WJR_ENABLE_ASSEMBLY) + #define WJR_HAS_BUILTIN_ASM_ADDC_N WJR_HAS_DEF_VAR(3) + #endif #endif #if WJR_HAS_BUILTIN(ASM_ADDC) == 2 -#include + #include #endif namespace wjr { @@ -55,7 +55,7 @@ namespace wjr { template WJR_INTRINSIC_INLINE uint64_t asm_addc(uint64_t a, uint64_t b, U c_in, U &c_out) noexcept { -#if WJR_HAS_BUILTIN(ASM_ADDC) == 1 + #if WJR_HAS_BUILTIN(ASM_ADDC) == 1 if (WJR_BUILTIN_CONSTANT_P_TRUE(c_in == 1)) { if (WJR_BUILTIN_CONSTANT_P(b) && in_range(b)) { asm("stc\n\t" @@ -113,11 +113,11 @@ WJR_INTRINSIC_INLINE uint64_t asm_addc(uint64_t a, uint64_t b, U c_in, } c_out = c_in; return a; -#else + #else uint64_t ret; c_out = fast_cast(_addcarry_u64(fast_cast(c_in), a, b, &ret)); return ret; -#endif + #endif } #endif @@ -189,8 +189,8 @@ WJR_INTRINSIC_INLINE uint64_t asm_addc_cc(uint64_t a, uint64_t b, uint8_t c_in, #endif #if WJR_HAS_BUILTIN(ASM_ADDC_N) -#define WJR_ADDSUB_I 1 -#include + #define WJR_ADDSUB_I 1 + #include #endif #if WJR_HAS_BUILTIN(__ASM_ADD_128) @@ -388,4 +388,4 @@ WJR_INTRINSIC_INLINE uint8_t __asm_addc_cc_128(uint64_t &al, uint64_t &ah, uint6 } // namespace wjr -#endif // WJR_X86_MATH_ADD_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_ADD_HPP__ \ No newline at end of file diff --git 
a/include/wjr/arch/x86/math/compare.hpp b/include/wjr/arch/x86/math/compare.hpp index bfd2bb00..08a0893a 100644 --- a/include/wjr/arch/x86/math/compare.hpp +++ b/include/wjr/arch/x86/math/compare.hpp @@ -1,5 +1,5 @@ -#ifndef WJR_X86_MATH_COMPARE_HPP__ -#define WJR_X86_MATH_COMPARE_HPP__ +#ifndef WJR_ARCH_X86_MATH_COMPARE_HPP__ +#define WJR_ARCH_X86_MATH_COMPARE_HPP__ #include @@ -89,8 +89,8 @@ WJR_INTRINSIC_INLINE int builtin_reverse_compare_n(const T *src0, const T *src1, // __uint128_t has certain bugs in GCC 13.2, resulting in low performance #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN___ASM_LESS_128 WJR_HAS_DEF -#define WJR_HAS_BUILTIN___ASM_LESS_EQUAL_128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN___ASM_LESS_128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN___ASM_LESS_EQUAL_128 WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(__ASM_LESS_128) @@ -125,4 +125,4 @@ __asm_less_equal_128(uint64_t lo0, uint64_t hi0, uint64_t lo1, uint64_t hi1) noe } // namespace wjr -#endif // WJR_X86_MATH_COMPARE_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_COMPARE_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/div.hpp b/include/wjr/arch/x86/math/div.hpp index bbbef7a3..4e1d1b1d 100644 --- a/include/wjr/arch/x86/math/div.hpp +++ b/include/wjr/arch/x86/math/div.hpp @@ -1,27 +1,27 @@ -#ifndef WJR_X86_MATH_DIV_HPP__ -#define WJR_X86_MATH_DIV_HPP__ +#ifndef WJR_ARCH_X86_MATH_DIV_HPP__ +#define WJR_ARCH_X86_MATH_DIV_HPP__ #include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif namespace wjr { #if defined(__BMI2__) -#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_DIVEXACT_DBM1C WJR_HAS_DEF -#elif defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_DIVEXACT_DBM1C WJR_HAS_DEF_VAR(3) -#endif + #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) + #define WJR_HAS_BUILTIN_ASM_DIVEXACT_DBM1C WJR_HAS_DEF + #elif defined(WJR_ENABLE_ASSEMBLY) + #define WJR_HAS_BUILTIN_ASM_DIVEXACT_DBM1C WJR_HAS_DEF_VAR(3) + 
#endif #endif #if WJR_HAS_BUILTIN(ASM_DIVEXACT_DBM1C) -#if WJR_HAS_BUILTIN(ASM_DIVEXACT_DBM1C) == 1 + #if WJR_HAS_BUILTIN(ASM_DIVEXACT_DBM1C) == 1 // TODO : optimize pipeline inline uint64_t asm_divexact_dbm1c(uint64_t *dst, const uint64_t *src, size_t n, @@ -77,7 +77,7 @@ inline uint64_t asm_divexact_dbm1c(uint64_t *dst, const uint64_t *src, size_t n, return r8; } -#else + #else extern "C" WJR_MS_ABI uint64_t __wjr_asm_divexact_dbm1c(uint64_t *dst, const uint64_t *src, size_t n, @@ -89,10 +89,10 @@ WJR_INTRINSIC_INLINE uint64_t asm_divexact_dbm1c(uint64_t *dst, const uint64_t * return __wjr_asm_divexact_dbm1c(dst, src, n, bd, h); } -#endif + #endif #endif } // namespace wjr -#endif // WJR_X86_MATH_DIV_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_DIV_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/divider.hpp b/include/wjr/arch/x86/math/divider.hpp index 485f3a39..3eff83a1 100644 --- a/include/wjr/arch/x86/math/divider.hpp +++ b/include/wjr/arch/x86/math/divider.hpp @@ -1,16 +1,16 @@ -#ifndef WJR_X86_MATH_DIVIDER_HPP__ -#define WJR_X86_MATH_DIVIDER_HPP__ +#ifndef WJR_ARCH_X86_MATH_DIVIDER_HPP__ +#define WJR_ARCH_X86_MATH_DIVIDER_HPP__ #include namespace wjr { #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_DIV2BY1_ADJUST WJR_HAS_DEF + #define WJR_HAS_BUILTIN_ASM_DIV2BY1_ADJUST WJR_HAS_DEF -#if defined(WJR_COMPILER_CLANG) && !WJR_HAS_CLANG(13, 0, 0) -#define WJR_HAS_BUILTIN_ASM_DIV2BY1_ADJUST_BRANCH WJR_HAS_DEF -#endif + #if defined(WJR_COMPILER_CLANG) && !WJR_HAS_CLANG(13, 0, 0) + #define WJR_HAS_BUILTIN_ASM_DIV2BY1_ADJUST_BRANCH WJR_HAS_DEF + #endif #endif @@ -41,4 +41,4 @@ WJR_INTRINSIC_INLINE void asm_div2by1_adjust_branch(T div, T &lo) noexcept { } // namespace wjr -#endif // WJR_X86_MATH_DIVIDER_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_DIVIDER_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/find.hpp b/include/wjr/arch/x86/math/find.hpp index 
e26b8056..8f32da31 100644 --- a/include/wjr/arch/x86/math/find.hpp +++ b/include/wjr/arch/x86/math/find.hpp @@ -1,5 +1,5 @@ -#ifndef WJR_X86_MATH_FIND_HPP__ -#define WJR_X86_MATH_FIND_HPP__ +#ifndef WJR_ARCH_X86_MATH_FIND_HPP__ +#define WJR_ARCH_X86_MATH_FIND_HPP__ #include @@ -9,22 +9,22 @@ namespace wjr { template WJR_PURE size_t large_builtin_find_n(const T *src0, const T *src1, size_t n) noexcept { -#define WJR_REGISTER_FIND_N_AVX(index) \ - do { \ - auto x = avx::loadu(src0 + (index)); \ - auto y = avx::loadu(src1 + (index)); \ - auto r = avx::cmpeq_epi64(x, y); \ + #define WJR_REGISTER_FIND_N_AVX(index) \ + do { \ + auto x = avx::loadu(src0 + (index)); \ + auto y = avx::loadu(src1 + (index)); \ + auto r = avx::cmpeq_epi64(x, y); \ \ - avx::mask_type mask = avx::movemask_epi8(r); \ - if (WJR_LIKELY(mask != 0)) { \ - return (index) + ctz(mask) / 8; \ - } \ - } while (false) + avx::mask_type mask = avx::movemask_epi8(r); \ + if (WJR_LIKELY(mask != 0)) { \ + return (index) + ctz(mask) / 8; \ + } \ + } while (false) size_t rem = n & 7; if (rem > 4) { -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) auto x0 = sse::loadu(src0 + (rem - 4)); auto x1 = sse::loadu(src0 + (rem - 2)); auto y1 = sse::loadu(src1 + (rem - 2)); @@ -42,16 +42,16 @@ WJR_PURE size_t large_builtin_find_n(const T *src0, const T *src1, size_t n) noe mask = sse::movemask_epi8(r1); return (rem - 2) + (mask == 0xFF00); } -#else + #else WJR_REGISTER_FIND_N_AVX(rem - 4); -#endif + #endif } if (WJR_UNLIKELY(rem == n)) { return n; } -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { auto x0 = sse::loadu(src0 + rem); auto x1 = sse::loadu(src0 + rem + 2); @@ -91,7 +91,7 @@ WJR_PURE size_t large_builtin_find_n(const T *src0, const T *src1, size_t n) noe rem += 8; } while (WJR_LIKELY(rem != n)); -#else + #else if ((n - rem) & 8) { WJR_REGISTER_FIND_N_AVX(rem); WJR_REGISTER_FIND_N_AVX(rem + 4); @@ -142,11 +142,11 @@ WJR_PURE size_t large_builtin_find_n(const T *src0, const T *src1, size_t n) noe rem += 
16; } while (WJR_LIKELY(rem != n)); -#endif + #endif return n; -#undef WJR_REGISTER_FIND_N_AVX + #undef WJR_REGISTER_FIND_N_AVX } template @@ -179,27 +179,27 @@ WJR_INTRINSIC_INLINE size_t builtin_find_n(const T *src0, const T *src1, template WJR_PURE size_t large_builtin_find_n(const T *src, T val, size_t n) noexcept { -#define WJR_REGISTER_FIND_N_AVX(index) \ - do { \ - auto x = avx::loadu(src + (index)); \ - auto r = avx::cmpeq_epi64(x, y); \ + #define WJR_REGISTER_FIND_N_AVX(index) \ + do { \ + auto x = avx::loadu(src + (index)); \ + auto r = avx::cmpeq_epi64(x, y); \ \ - auto mask = avx::movemask_epi8(r); \ - if (WJR_LIKELY(mask != 0)) { \ - return (index) + ctz(mask) / 8; \ - } \ - } while (false) + auto mask = avx::movemask_epi8(r); \ + if (WJR_LIKELY(mask != 0)) { \ + return (index) + ctz(mask) / 8; \ + } \ + } while (false) -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) auto y = sse::set1(val, T()); -#else + #else auto y = avx::set1(val, T()); -#endif + #endif size_t rem = n & 7; if (rem > 4) { -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) auto x0 = sse::loadu(src + (rem - 4)); auto x1 = sse::loadu(src + (rem - 2)); @@ -215,16 +215,16 @@ WJR_PURE size_t large_builtin_find_n(const T *src, T val, size_t n) noexcept { mask = sse::movemask_epi8(r1); return rem - 2 + (mask == 0xFF00); } -#else + #else WJR_REGISTER_FIND_N_AVX(rem - 4); -#endif + #endif } if (WJR_UNLIKELY(rem == n)) { return n; } -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { auto x0 = sse::loadu(src + rem); auto x1 = sse::loadu(src + rem + 2); @@ -260,7 +260,7 @@ WJR_PURE size_t large_builtin_find_n(const T *src, T val, size_t n) noexcept { rem += 8; } while (WJR_LIKELY(rem != n)); -#else + #else if ((n - rem) & 8) { WJR_REGISTER_FIND_N_AVX(rem); WJR_REGISTER_FIND_N_AVX(rem + 4); @@ -307,11 +307,11 @@ WJR_PURE size_t large_builtin_find_n(const T *src, T val, size_t n) noexcept { rem += 16; } while (WJR_LIKELY(rem != n)); -#endif + #endif return n; -#undef 
WJR_REGISTER_FIND_N_AVX + #undef WJR_REGISTER_FIND_N_AVX } template @@ -391,23 +391,23 @@ WJR_INTRINSIC_INLINE size_t builtin_find_not_n(const T *src, T val, size_t n) no template WJR_PURE size_t large_builtin_reverse_find_n(const T *src0, const T *src1, size_t n) noexcept { -#define WJR_REGISTER_REVERSE_FIND_N_AVX(index) \ - do { \ - auto x = avx::loadu(src0 - 4 + (index)); \ - auto y = avx::loadu(src1 - 4 + (index)); \ - auto r = avx::cmpeq_epi64(x, y); \ + #define WJR_REGISTER_REVERSE_FIND_N_AVX(index) \ + do { \ + auto x = avx::loadu(src0 - 4 + (index)); \ + auto y = avx::loadu(src1 - 4 + (index)); \ + auto r = avx::cmpeq_epi64(x, y); \ \ - avx::mask_type mask = avx::movemask_epi8(r); \ - if (WJR_LIKELY(mask != 0)) { \ - return (index)-clz(mask) / 8; \ - } \ - } while (false) + avx::mask_type mask = avx::movemask_epi8(r); \ + if (WJR_LIKELY(mask != 0)) { \ + return (index)-clz(mask) / 8; \ + } \ + } while (false) const size_t rem = n & 7; n -= rem; if (rem > 4) { -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) auto x0 = sse::loadu(src0 + n + 2); auto x1 = sse::loadu(src0 + n); auto y0 = sse::loadu(src1 + n + 2); @@ -425,16 +425,16 @@ WJR_PURE size_t large_builtin_reverse_find_n(const T *src0, const T *src1, mask = sse::movemask_epi8(r1); return n + 2 - (mask == 0x00FF); } -#else + #else WJR_REGISTER_REVERSE_FIND_N_AVX(n + 4); -#endif + #endif } if (WJR_UNLIKELY(n == 0)) { return 0; } -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { auto x0 = sse::loadu(src0 + n - 8); auto x1 = sse::loadu(src0 + n - 6); @@ -474,7 +474,7 @@ WJR_PURE size_t large_builtin_reverse_find_n(const T *src0, const T *src1, n -= 8; } while (WJR_LIKELY(n != 0)); -#else + #else if ((n & 8) != 0) { WJR_REGISTER_REVERSE_FIND_N_AVX(n); WJR_REGISTER_REVERSE_FIND_N_AVX(n - 4); @@ -525,11 +525,11 @@ WJR_PURE size_t large_builtin_reverse_find_n(const T *src0, const T *src1, n -= 16; } while (WJR_LIKELY(n != 0)); -#endif + #endif return 0; -#undef WJR_REGISTER_REVERSE_FIND_N_AVX + #undef 
WJR_REGISTER_REVERSE_FIND_N_AVX } template @@ -563,28 +563,28 @@ WJR_INTRINSIC_INLINE size_t builtin_reverse_find_n(const T *src0, const T *src1, template WJR_PURE size_t large_builtin_reverse_find_n(const T *src, T val, size_t n) noexcept { -#define WJR_REGISTER_REVERSE_FIND_N_AVX(index) \ - do { \ - auto x = avx::loadu(src - 4 + (index)); \ - auto r = avx::cmpeq_epi64(x, y); \ + #define WJR_REGISTER_REVERSE_FIND_N_AVX(index) \ + do { \ + auto x = avx::loadu(src - 4 + (index)); \ + auto r = avx::cmpeq_epi64(x, y); \ \ - avx::mask_type mask = avx::movemask_epi8(r); \ - if (WJR_LIKELY(mask != 0)) { \ - return (index)-clz(mask) / 8; \ - } \ - } while (false) + avx::mask_type mask = avx::movemask_epi8(r); \ + if (WJR_LIKELY(mask != 0)) { \ + return (index)-clz(mask) / 8; \ + } \ + } while (false) -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) auto y = sse::set1(val, T()); -#else + #else auto y = avx::set1(val, T()); -#endif + #endif const size_t rem = n & 7; n -= rem; if (rem > 4) { -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) auto x0 = sse::loadu(src + n + 2); auto x1 = sse::loadu(src + n); @@ -600,16 +600,16 @@ WJR_PURE size_t large_builtin_reverse_find_n(const T *src, T val, size_t n) noex mask = sse::movemask_epi8(r1); return n + 2 - (mask == 0x00FF); } -#else + #else WJR_REGISTER_REVERSE_FIND_N_AVX(n + 4); -#endif + #endif } if (WJR_UNLIKELY(n == 0)) { return 0; } -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { auto x0 = sse::loadu(src + n - 8); auto x1 = sse::loadu(src + n - 6); @@ -645,7 +645,7 @@ WJR_PURE size_t large_builtin_reverse_find_n(const T *src, T val, size_t n) noex n -= 8; } while (WJR_LIKELY(n != 0)); -#else + #else if ((n & 8) != 0) { WJR_REGISTER_REVERSE_FIND_N_AVX(n); WJR_REGISTER_REVERSE_FIND_N_AVX(n - 4); @@ -692,11 +692,11 @@ WJR_PURE size_t large_builtin_reverse_find_n(const T *src, T val, size_t n) noex n -= 16; } while (WJR_LIKELY(n != 0)); -#endif + #endif return 0; -#undef WJR_REGISTER_REVERSE_FIND_N_AVX + #undef 
WJR_REGISTER_REVERSE_FIND_N_AVX } template @@ -777,4 +777,4 @@ WJR_INTRINSIC_INLINE size_t builtin_reverse_find_not_n(const T *src, T val, } // namespace wjr -#endif // WJR_X86_MATH_FIND_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_FIND_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/gen_addrsblsh_n.hpp b/include/wjr/arch/x86/math/gen_addrsblsh_n.hpp index 0d5cfb96..d7c29fcf 100644 --- a/include/wjr/arch/x86/math/gen_addrsblsh_n.hpp +++ b/include/wjr/arch/x86/math/gen_addrsblsh_n.hpp @@ -3,17 +3,17 @@ // 1 : ADD #ifndef WJR_ADDSUB_I -#error "abort" + #error "abort" #endif #if WJR_ADDSUB_I == 1 -#define WJR_addsub add -#define WJR_adcsbb adc -#define __WJR_TEST_ASSEMBLY ASM_ADDLSH_N + #define WJR_addsub add + #define WJR_adcsbb adc + #define __WJR_TEST_ASSEMBLY ASM_ADDLSH_N #else -#define WJR_addsub rsb -#define WJR_adcsbb sbb -#define __WJR_TEST_ASSEMBLY ASM_RSBLSH_N + #define WJR_addsub rsb + #define WJR_adcsbb sbb + #define __WJR_TEST_ASSEMBLY ASM_RSBLSH_N #endif #if WJR_HAS_BUILTIN(__WJR_TEST_ASSEMBLY) == 1 diff --git a/include/wjr/arch/x86/math/gen_addsub.hpp b/include/wjr/arch/x86/math/gen_addsub.hpp index ba40e4ca..c89662ce 100644 --- a/include/wjr/arch/x86/math/gen_addsub.hpp +++ b/include/wjr/arch/x86/math/gen_addsub.hpp @@ -5,17 +5,17 @@ #include #ifndef WJR_ADDSUB_I -#error "abort" + #error "abort" #endif #if WJR_ADDSUB_I == 1 -#define WJR_addcsubc addc -#define WJR_adcsbb adc -#define __WJR_TEST_ASSEMBLY ASM_ADDC_N + #define WJR_addcsubc addc + #define WJR_adcsbb adc + #define __WJR_TEST_ASSEMBLY ASM_ADDC_N #else -#define WJR_addcsubc subc -#define WJR_adcsbb sbb -#define __WJR_TEST_ASSEMBLY ASM_SUBC_N + #define WJR_addcsubc subc + #define WJR_adcsbb sbb + #define __WJR_TEST_ASSEMBLY ASM_SUBC_N #endif #if WJR_HAS_BUILTIN(__WJR_TEST_ASSEMBLY) == 1 diff --git a/include/wjr/arch/x86/math/large-compare-impl.hpp b/include/wjr/arch/x86/math/large-compare-impl.hpp index 1d4e7803..a13b2e56 100644 --- 
a/include/wjr/arch/x86/math/large-compare-impl.hpp +++ b/include/wjr/arch/x86/math/large-compare-impl.hpp @@ -1,19 +1,19 @@ -#ifndef WJR_X86_MATH_LARGE_COMPARE_IMPL_HPP__ -#define WJR_X86_MATH_LARGE_COMPARE_IMPL_HPP__ +#ifndef WJR_ARCH_X86_MATH_LARGE_COMPARE_IMPL_HPP__ +#define WJR_ARCH_X86_MATH_LARGE_COMPARE_IMPL_HPP__ +#include #include #include -#include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif namespace wjr { #if WJR_HAS_SIMD(SSE4_1) -#define WJR_HAS_BUILTIN_COMPARE_N WJR_HAS_DEF -#define WJR_HAS_BUILTIN_REVERSE_COMPARE_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_COMPARE_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_REVERSE_COMPARE_N WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(COMPARE_N) @@ -24,44 +24,44 @@ namespace wjr { */ template WJR_PURE int large_builtin_compare_n(const T *src0, const T *src1, size_t n) noexcept { -#define WJR_REGISTER_COMPARE_NOT_N_2(index) \ - do { \ - const auto x = sse::loadu(src0 + (index)); \ - const auto y = sse::loadu(src1 + (index)); \ - const auto r = sse::cmpeq_epi64(x, y); \ + #define WJR_REGISTER_COMPARE_NOT_N_2(index) \ + do { \ + const auto x = sse::loadu(src0 + (index)); \ + const auto y = sse::loadu(src1 + (index)); \ + const auto r = sse::cmpeq_epi64(x, y); \ \ - const sse::mask_type mask = ~sse::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - if (mask == 0xFF00) { \ - return src0[(index) + 1] < src1[(index) + 1] ? -1 : 1; \ + const sse::mask_type mask = ~sse::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + if (mask == 0xFF00) { \ + return src0[(index) + 1] < src1[(index) + 1] ? -1 : 1; \ + } \ + return src0[index] < src1[index] ? -1 : 1; \ } \ - return src0[index] < src1[index] ? 
-1 : 1; \ - } \ - } while (false) - -#if WJR_HAS_SIMD(AVX2) -#define WJR_REGISTER_COMPARE_NOT_N_4(index) \ - do { \ - const auto x = avx::loadu(src0 + (index)); \ - const auto y = avx::loadu(src1 + (index)); \ - const auto r = avx::cmpeq_epi64(x, y); \ + } while (false) + + #if WJR_HAS_SIMD(AVX2) + #define WJR_REGISTER_COMPARE_NOT_N_4(index) \ + do { \ + const auto x = avx::loadu(src0 + (index)); \ + const auto y = avx::loadu(src1 + (index)); \ + const auto r = avx::cmpeq_epi64(x, y); \ \ - const avx::mask_type mask = ~avx::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - const auto offset = ctz(mask) / 8; \ - return src0[(index) + offset] < src1[(index) + offset] ? -1 : 1; \ - } \ - } while (false) -#else -#define WJR_REGISTER_COMPARE_NOT_N_4(index) \ - WJR_REGISTER_COMPARE_NOT_N_2(index); \ - WJR_REGISTER_COMPARE_NOT_N_2((index) + 2) -#endif - -#define WJR_REGISTER_COMPARE_NOT_N_ADVANCE(index) \ - src0 += index; \ - src1 += index -#define WJR_REGISTER_COMPARE_NOT_N_RET(index) 0 + const avx::mask_type mask = ~avx::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + const auto offset = ctz(mask) / 8; \ + return src0[(index) + offset] < src1[(index) + offset] ? 
-1 : 1; \ + } \ + } while (false) + #else + #define WJR_REGISTER_COMPARE_NOT_N_4(index) \ + WJR_REGISTER_COMPARE_NOT_N_2(index); \ + WJR_REGISTER_COMPARE_NOT_N_2((index) + 2) + #endif + + #define WJR_REGISTER_COMPARE_NOT_N_ADVANCE(index) \ + src0 += index; \ + src1 += index + #define WJR_REGISTER_COMPARE_NOT_N_RET(index) 0 WJR_ASSUME(n > 2); @@ -69,7 +69,7 @@ WJR_PURE int large_builtin_compare_n(const T *src0, const T *src1, size_t n) noe n, WJR_REGISTER_COMPARE_NOT_N_2, WJR_REGISTER_COMPARE_NOT_N_4, WJR_HAS_SIMD(AVX2), WJR_REGISTER_COMPARE_NOT_N_ADVANCE, , WJR_REGISTER_COMPARE_NOT_N_RET); -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { const auto r0 = sse::cmpeq_epi64(sse::loadu(src0), sse::loadu(src1)); const auto r1 = sse::cmpeq_epi64(sse::loadu(src0 + 2), sse::loadu(src1 + 2)); @@ -114,7 +114,7 @@ WJR_PURE int large_builtin_compare_n(const T *src0, const T *src1, size_t n) noe src1 += 8; n -= 8; } while (WJR_LIKELY(n != 0)); -#else + #else do { const auto r0 = avx::cmpeq_epi64(avx::loadu(src0), avx::loadu(src1)); const auto r1 = avx::cmpeq_epi64(avx::loadu(src0 + 4), avx::loadu(src1 + 4)); @@ -151,14 +151,14 @@ WJR_PURE int large_builtin_compare_n(const T *src0, const T *src1, size_t n) noe src1 += 16; n -= 16; } while (WJR_LIKELY(n != 0)); -#endif + #endif return 0; -#undef WJR_REGISTER_COMPARE_NOT_N_RET -#undef WJR_REGISTER_COMPARE_NOT_N_ADVANCE -#undef WJR_REGISTER_COMPARE_NOT_N_4 -#undef WJR_REGISTER_COMPARE_NOT_N_2 + #undef WJR_REGISTER_COMPARE_NOT_N_RET + #undef WJR_REGISTER_COMPARE_NOT_N_ADVANCE + #undef WJR_REGISTER_COMPARE_NOT_N_4 + #undef WJR_REGISTER_COMPARE_NOT_N_2 } extern template WJR_PURE int large_builtin_compare_n(const uint64_t *src0, @@ -178,45 +178,46 @@ extern template WJR_PURE int large_builtin_compare_n(const uint64_t *s template WJR_PURE int large_builtin_reverse_compare_n(const T *src0, const T *src1, size_t n) noexcept { -#define WJR_REGISTER_REVERSE_COMPARE_NOT_N_2(index) \ - do { \ - const auto x = sse::loadu(src0 + (index)); 
\ - const auto y = sse::loadu(src1 + (index)); \ - const auto r = sse::cmpeq_epi64(x, y); \ + #define WJR_REGISTER_REVERSE_COMPARE_NOT_N_2(index) \ + do { \ + const auto x = sse::loadu(src0 + (index)); \ + const auto y = sse::loadu(src1 + (index)); \ + const auto r = sse::cmpeq_epi64(x, y); \ \ - const sse::mask_type mask = ~sse::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - if (mask == 0x00FF) { \ - return src0[index] < src1[index] ? -1 : 1; \ + const sse::mask_type mask = ~sse::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + if (mask == 0x00FF) { \ + return src0[index] < src1[index] ? -1 : 1; \ + } \ + return src0[(index) + 1] < src1[(index) + 1] ? -1 : 1; \ } \ - return src0[(index) + 1] < src1[(index) + 1] ? -1 : 1; \ - } \ - } while (false) - -#if WJR_HAS_SIMD(AVX2) -#define WJR_REGISTER_REVERSE_COMPARE_NOT_N_4(index) \ - do { \ - const auto x = avx::loadu(src0 + (index)); \ - const auto y = avx::loadu(src1 + (index)); \ - const auto r = avx::cmpeq_epi64(x, y); \ + } while (false) + + #if WJR_HAS_SIMD(AVX2) + #define WJR_REGISTER_REVERSE_COMPARE_NOT_N_4(index) \ + do { \ + const auto x = avx::loadu(src0 + (index)); \ + const auto y = avx::loadu(src1 + (index)); \ + const auto r = avx::cmpeq_epi64(x, y); \ \ - const avx::mask_type mask = ~avx::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - const auto offset = clz(mask) / 8; \ - return src0[(index) + 3 - offset] < src1[(index) + 3 - offset] ? 
-1 : 1; \ - } \ - } while (false) -#else -#define WJR_REGISTER_REVERSE_COMPARE_NOT_N_4(index) \ - WJR_REGISTER_REVERSE_COMPARE_NOT_N_2((index) + 2); \ - WJR_REGISTER_REVERSE_COMPARE_NOT_N_2(index) -#endif - -#define WJR_REGISTER_REVERSE_COMPARE_NOT_N_ADVANCE(index) \ - src0 += index; \ - src1 += index - -#define WJR_REGISTER_REVERSE_COMPARE_NOT_N_RET(index) 0 + const avx::mask_type mask = ~avx::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + const auto offset = clz(mask) / 8; \ + return src0[(index) + 3 - offset] < src1[(index) + 3 - offset] ? -1 \ + : 1; \ + } \ + } while (false) + #else + #define WJR_REGISTER_REVERSE_COMPARE_NOT_N_4(index) \ + WJR_REGISTER_REVERSE_COMPARE_NOT_N_2((index) + 2); \ + WJR_REGISTER_REVERSE_COMPARE_NOT_N_2(index) + #endif + + #define WJR_REGISTER_REVERSE_COMPARE_NOT_N_ADVANCE(index) \ + src0 += index; \ + src1 += index + + #define WJR_REGISTER_REVERSE_COMPARE_NOT_N_RET(index) 0 WJR_ASSUME(n > 2); @@ -225,7 +226,7 @@ WJR_PURE int large_builtin_reverse_compare_n(const T *src0, const T *src1, WJR_HAS_SIMD(AVX2), WJR_REGISTER_REVERSE_COMPARE_NOT_N_ADVANCE, , WJR_REGISTER_REVERSE_COMPARE_NOT_N_RET); -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { const auto r0 = sse::cmpeq_epi64(sse::loadu(src0 - 8), sse::loadu(src1 - 8)); const auto r1 = sse::cmpeq_epi64(sse::loadu(src0 - 6), sse::loadu(src1 - 6)); @@ -270,7 +271,7 @@ WJR_PURE int large_builtin_reverse_compare_n(const T *src0, const T *src1, src1 -= 8; n -= 8; } while (WJR_LIKELY(n != 0)); -#else + #else do { const auto r0 = avx::cmpeq_epi64(avx::loadu(src0 - 16), avx::loadu(src1 - 16)); const auto r1 = avx::cmpeq_epi64(avx::loadu(src0 - 12), avx::loadu(src1 - 12)); @@ -307,14 +308,14 @@ WJR_PURE int large_builtin_reverse_compare_n(const T *src0, const T *src1, src1 -= 16; n -= 16; } while (WJR_LIKELY(n != 0)); -#endif + #endif return 0; -#undef WJR_REGISTER_REVERSE_COMPARE_NOT_N_RET -#undef WJR_REGISTER_REVERSE_COMPARE_NOT_N_ADVANCE -#undef 
WJR_REGISTER_REVERSE_COMPARE_NOT_N_4 -#undef WJR_REGISTER_REVERSE_COMPARE_NOT_N_2 + #undef WJR_REGISTER_REVERSE_COMPARE_NOT_N_RET + #undef WJR_REGISTER_REVERSE_COMPARE_NOT_N_ADVANCE + #undef WJR_REGISTER_REVERSE_COMPARE_NOT_N_4 + #undef WJR_REGISTER_REVERSE_COMPARE_NOT_N_2 } extern template WJR_PURE int @@ -325,4 +326,4 @@ large_builtin_reverse_compare_n(const uint64_t *src0, const uint64_t * } // namespace wjr -#endif // WJR_X86_MATH_LARGE_COMPARE_IMPL_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_LARGE_COMPARE_IMPL_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/large-find-impl.hpp b/include/wjr/arch/x86/math/large-find-impl.hpp index d32d7b3a..23e4ddef 100644 --- a/include/wjr/arch/x86/math/large-find-impl.hpp +++ b/include/wjr/arch/x86/math/large-find-impl.hpp @@ -1,21 +1,21 @@ -#ifndef WJR_X86_MATH_LARGE_FIND_IMPL_HPP__ -#define WJR_X86_MATH_LARGE_FIND_IMPL_HPP__ +#ifndef WJR_ARCH_X86_MATH_LARGE_FIND_IMPL_HPP__ +#define WJR_ARCH_X86_MATH_LARGE_FIND_IMPL_HPP__ +#include #include #include -#include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif namespace wjr { #if WJR_HAS_SIMD(SSE4_1) -#define WJR_HAS_BUILTIN_FIND_N WJR_HAS_DEF -#define WJR_HAS_BUILTIN_REVERSE_FIND_N WJR_HAS_DEF -#define WJR_HAS_BUILTIN_FIND_NOT_N WJR_HAS_DEF -#define WJR_HAS_BUILTIN_REVERSE_FIND_NOT_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_FIND_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_REVERSE_FIND_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_FIND_NOT_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_REVERSE_FIND_NOT_N WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(FIND_NOT_N) @@ -23,48 +23,48 @@ namespace wjr { template WJR_PURE size_t large_builtin_find_not_n(const T *src0, const T *src1, size_t n) noexcept { -#define WJR_REGISTER_FIND_NOT_N_2(index) \ - do { \ - const auto x = sse::loadu(src0 + (index)); \ - const auto y = sse::loadu(src1 + (index)); \ - const auto r = sse::cmpeq_epi64(x, y); \ + #define WJR_REGISTER_FIND_NOT_N_2(index) \ + do { \ + 
const auto x = sse::loadu(src0 + (index)); \ + const auto y = sse::loadu(src1 + (index)); \ + const auto r = sse::cmpeq_epi64(x, y); \ \ - const sse::mask_type mask = ~sse::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - return (index) + (mask == 0xFF00); \ - } \ - } while (false) - -#if WJR_HAS_SIMD(AVX2) -#define WJR_REGISTER_FIND_NOT_N_4(index) \ - do { \ - const auto x = avx::loadu(src0 + (index)); \ - const auto y = avx::loadu(src1 + (index)); \ - const auto r = avx::cmpeq_epi64(x, y); \ + const sse::mask_type mask = ~sse::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + return (index) + (mask == 0xFF00); \ + } \ + } while (false) + + #if WJR_HAS_SIMD(AVX2) + #define WJR_REGISTER_FIND_NOT_N_4(index) \ + do { \ + const auto x = avx::loadu(src0 + (index)); \ + const auto y = avx::loadu(src1 + (index)); \ + const auto r = avx::cmpeq_epi64(x, y); \ \ - const avx::mask_type mask = ~avx::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - return (index) + ctz(mask) / 8; \ - } \ - } while (false) -#else -#define WJR_REGISTER_FIND_NOT_N_4(index) \ - WJR_REGISTER_FIND_NOT_N_2(index); \ - WJR_REGISTER_FIND_NOT_N_2((index) + 2) -#endif - -#define WJR_REGISTER_FIND_NOT_N_ADVNCE(index) \ - src0 += index; \ - src1 += index - -#define WJR_REGISTER_FIND_NOT_N_RET(index) index + const avx::mask_type mask = ~avx::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + return (index) + ctz(mask) / 8; \ + } \ + } while (false) + #else + #define WJR_REGISTER_FIND_NOT_N_4(index) \ + WJR_REGISTER_FIND_NOT_N_2(index); \ + WJR_REGISTER_FIND_NOT_N_2((index) + 2) + #endif + + #define WJR_REGISTER_FIND_NOT_N_ADVNCE(index) \ + src0 += index; \ + src1 += index + + #define WJR_REGISTER_FIND_NOT_N_RET(index) index WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION( n, WJR_REGISTER_FIND_NOT_N_2, WJR_REGISTER_FIND_NOT_N_4, WJR_HAS_SIMD(AVX2), WJR_REGISTER_FIND_NOT_N_ADVNCE, const auto __src0 = src0, WJR_REGISTER_FIND_NOT_N_RET); -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { 
const auto r0 = sse::cmpeq_epi64(sse::loadu(src0), sse::loadu(src1)); const auto r1 = sse::cmpeq_epi64(sse::loadu(src0 + 2), sse::loadu(src1 + 2)); @@ -97,7 +97,7 @@ WJR_PURE size_t large_builtin_find_not_n(const T *src0, const T *src1, src1 += 8; n -= 8; } while (WJR_LIKELY(n != 0)); -#else + #else do { const auto r0 = avx::cmpeq_epi64(avx::loadu(src0), avx::loadu(src1)); const auto r1 = avx::cmpeq_epi64(avx::loadu(src0 + 4), avx::loadu(src1 + 4)); @@ -130,14 +130,14 @@ WJR_PURE size_t large_builtin_find_not_n(const T *src0, const T *src1, src1 += 16; n -= 16; } while (WJR_LIKELY(n != 0)); -#endif + #endif return src0 - __src0; -#undef WJR_REGISTER_FIND_NOT_N_RET -#undef WJR_REGISTER_FIND_NOT_N_ADVNCE -#undef WJR_REGISTER_FIND_NOT_N_4 -#undef WJR_REGISTER_FIND_NOT_N_2 + #undef WJR_REGISTER_FIND_NOT_N_RET + #undef WJR_REGISTER_FIND_NOT_N_ADVNCE + #undef WJR_REGISTER_FIND_NOT_N_4 + #undef WJR_REGISTER_FIND_NOT_N_2 } extern template WJR_PURE size_t large_builtin_find_not_n(const uint64_t *src0, @@ -146,47 +146,47 @@ extern template WJR_PURE size_t large_builtin_find_not_n(const uint64_ template WJR_PURE size_t large_builtin_find_not_n(const T *src, T val, size_t n) noexcept { -#define WJR_REGISTER_FIND_NOT_N_2(index) \ - do { \ - const auto r = sse::cmpeq_epi64(sse::loadu(src + (index)), y2); \ + #define WJR_REGISTER_FIND_NOT_N_2(index) \ + do { \ + const auto r = sse::cmpeq_epi64(sse::loadu(src + (index)), y2); \ \ - const sse::mask_type mask = ~sse::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - return (index) + (mask == 0xFF00); \ - } \ - } while (false) - -#if WJR_HAS_SIMD(AVX2) -#define WJR_REGISTER_FIND_NOT_N_4(index) \ - do { \ - const auto r = avx::cmpeq_epi64(avx::loadu(src + (index)), y4); \ + const sse::mask_type mask = ~sse::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + return (index) + (mask == 0xFF00); \ + } \ + } while (false) + + #if WJR_HAS_SIMD(AVX2) + #define WJR_REGISTER_FIND_NOT_N_4(index) \ + do { \ + const auto r = 
avx::cmpeq_epi64(avx::loadu(src + (index)), y4); \ \ - const avx::mask_type mask = ~avx::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - return (index) + ctz(mask) / 8; \ - } \ - } while (false) -#else -#define WJR_REGISTER_FIND_NOT_N_4(index) \ - WJR_REGISTER_FIND_NOT_N_2(index); \ - WJR_REGISTER_FIND_NOT_N_2((index) + 2) -#endif + const avx::mask_type mask = ~avx::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + return (index) + ctz(mask) / 8; \ + } \ + } while (false) + #else + #define WJR_REGISTER_FIND_NOT_N_4(index) \ + WJR_REGISTER_FIND_NOT_N_2(index); \ + WJR_REGISTER_FIND_NOT_N_2((index) + 2) + #endif -#define WJR_REGISTER_FIND_NOT_N_ADVANCE(index) src += index + #define WJR_REGISTER_FIND_NOT_N_ADVANCE(index) src += index -#define WJR_REGISTER_FIND_NOT_N_RET(index) index + #define WJR_REGISTER_FIND_NOT_N_RET(index) index const auto y2 = sse::set1(val, T()); -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) const auto y4 = broadcast<__m128i_t, __m256i_t>(y2); -#endif + #endif WJR_REGISTER_X86_NORMAL_SIMD_FUNCTION( n, WJR_REGISTER_FIND_NOT_N_2, WJR_REGISTER_FIND_NOT_N_4, WJR_HAS_SIMD(AVX2), WJR_REGISTER_FIND_NOT_N_ADVANCE, const auto __src = src, WJR_REGISTER_FIND_NOT_N_RET); -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { const auto r0 = sse::cmpeq_epi64(sse::loadu(src), y2); const auto r1 = sse::cmpeq_epi64(sse::loadu(src + 2), y2); @@ -218,7 +218,7 @@ WJR_PURE size_t large_builtin_find_not_n(const T *src, T val, size_t n) noexcept src += 8; n -= 8; } while (WJR_LIKELY(n != 0)); -#else + #else do { const auto r0 = avx::cmpeq_epi64(avx::loadu(src), y4); const auto r1 = avx::cmpeq_epi64(avx::loadu(src + 4), y4); @@ -250,14 +250,14 @@ WJR_PURE size_t large_builtin_find_not_n(const T *src, T val, size_t n) noexcept src += 16; n -= 16; } while (WJR_LIKELY(n != 0)); -#endif + #endif return src - __src; -#undef WJR_REGISTER_FIND_NOT_N_RET -#undef WJR_REGISTER_FIND_NOT_N_ADVANCE -#undef WJR_REGISTER_FIND_NOT_N_4 -#undef 
WJR_REGISTER_FIND_NOT_N_2 + #undef WJR_REGISTER_FIND_NOT_N_RET + #undef WJR_REGISTER_FIND_NOT_N_ADVANCE + #undef WJR_REGISTER_FIND_NOT_N_4 + #undef WJR_REGISTER_FIND_NOT_N_2 } extern template WJR_PURE size_t large_builtin_find_not_n(const uint64_t *src, @@ -271,48 +271,48 @@ extern template WJR_PURE size_t large_builtin_find_not_n(const uint64_ template WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src0, const T *src1, size_t n) noexcept { -#define WJR_REGISTER_REVERSE_FIND_NOT_N_2(index) \ - do { \ - const auto x = sse::loadu(src0 + (index)); \ - const auto y = sse::loadu(src1 + (index)); \ - const auto r = sse::cmpeq_epi64(x, y); \ + #define WJR_REGISTER_REVERSE_FIND_NOT_N_2(index) \ + do { \ + const auto x = sse::loadu(src0 + (index)); \ + const auto y = sse::loadu(src1 + (index)); \ + const auto r = sse::cmpeq_epi64(x, y); \ \ - const sse::mask_type mask = ~sse::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - return (index) + 2 - (mask == 0x00FF); \ - } \ - } while (false) - -#if WJR_HAS_SIMD(AVX2) -#define WJR_REGISTER_REVERSE_FIND_NOT_N_4(index) \ - do { \ - const auto x = avx::loadu(src0 + (index)); \ - const auto y = avx::loadu(src1 + (index)); \ - const auto r = avx::cmpeq_epi64(x, y); \ + const sse::mask_type mask = ~sse::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + return (index) + 2 - (mask == 0x00FF); \ + } \ + } while (false) + + #if WJR_HAS_SIMD(AVX2) + #define WJR_REGISTER_REVERSE_FIND_NOT_N_4(index) \ + do { \ + const auto x = avx::loadu(src0 + (index)); \ + const auto y = avx::loadu(src1 + (index)); \ + const auto r = avx::cmpeq_epi64(x, y); \ \ - const avx::mask_type mask = ~avx::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - return (index) + 4 - clz(mask) / 8; \ - } \ - } while (false) -#else -#define WJR_REGISTER_REVERSE_FIND_NOT_N_4(index) \ - WJR_REGISTER_REVERSE_FIND_NOT_N_2((index) + 2); \ - WJR_REGISTER_REVERSE_FIND_NOT_N_2(index); -#endif - -#define WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE(index) 
\ - src0 += index; \ - src1 += index - -#define WJR_REGISTER_REVERSE_FIND_NOT_N_RET(index) 0 + const avx::mask_type mask = ~avx::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + return (index) + 4 - clz(mask) / 8; \ + } \ + } while (false) + #else + #define WJR_REGISTER_REVERSE_FIND_NOT_N_4(index) \ + WJR_REGISTER_REVERSE_FIND_NOT_N_2((index) + 2); \ + WJR_REGISTER_REVERSE_FIND_NOT_N_2(index); + #endif + + #define WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE(index) \ + src0 += index; \ + src1 += index + + #define WJR_REGISTER_REVERSE_FIND_NOT_N_RET(index) 0 WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION( n, WJR_REGISTER_REVERSE_FIND_NOT_N_2, WJR_REGISTER_REVERSE_FIND_NOT_N_4, WJR_HAS_SIMD(AVX2), WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE, , WJR_REGISTER_REVERSE_FIND_NOT_N_RET); -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { const auto r0 = sse::cmpeq_epi64(sse::loadu(src0 - 8), sse::loadu(src1 - 8)); const auto r1 = sse::cmpeq_epi64(sse::loadu(src0 - 6), sse::loadu(src1 - 6)); @@ -345,7 +345,7 @@ WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src0, const T *src1, src1 -= 8; n -= 8; } while (WJR_LIKELY(n != 0)); -#else + #else do { const auto r0 = avx::cmpeq_epi64(avx::loadu(src0 - 16), avx::loadu(src1 - 16)); const auto r1 = avx::cmpeq_epi64(avx::loadu(src0 - 12), avx::loadu(src1 - 12)); @@ -378,14 +378,14 @@ WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src0, const T *src1, src1 -= 16; n -= 16; } while (WJR_LIKELY(n != 0)); -#endif + #endif return 0; -#undef WJR_REGISTER_REVERSE_FIND_NOT_N_RET -#undef WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE -#undef WJR_REGISTER_REVERSE_FIND_NOT_N_4 -#undef WJR_REGISTER_REVERSE_FIND_NOT_N_2 + #undef WJR_REGISTER_REVERSE_FIND_NOT_N_RET + #undef WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE + #undef WJR_REGISTER_REVERSE_FIND_NOT_N_4 + #undef WJR_REGISTER_REVERSE_FIND_NOT_N_2 } extern template WJR_PURE size_t large_builtin_reverse_find_not_n( @@ -393,49 +393,49 @@ extern template WJR_PURE size_t 
large_builtin_reverse_find_not_n( template WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src, T val, size_t n) noexcept { -#define WJR_REGISTER_REVERSE_FIND_NOT_N_2(index) \ - do { \ - const auto x = sse::loadu(src + (index)); \ - const auto r = sse::cmpeq_epi64(x, y2); \ + #define WJR_REGISTER_REVERSE_FIND_NOT_N_2(index) \ + do { \ + const auto x = sse::loadu(src + (index)); \ + const auto r = sse::cmpeq_epi64(x, y2); \ \ - const sse::mask_type mask = ~sse::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - return (index) + 2 - (mask == 0x00FF); \ - } \ - } while (false) - -#if WJR_HAS_SIMD(AVX2) -#define WJR_REGISTER_REVERSE_FIND_NOT_N_4(index) \ - do { \ - const auto x = avx::loadu(src + (index)); \ - const auto r = avx::cmpeq_epi64(x, y4); \ + const sse::mask_type mask = ~sse::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + return (index) + 2 - (mask == 0x00FF); \ + } \ + } while (false) + + #if WJR_HAS_SIMD(AVX2) + #define WJR_REGISTER_REVERSE_FIND_NOT_N_4(index) \ + do { \ + const auto x = avx::loadu(src + (index)); \ + const auto r = avx::cmpeq_epi64(x, y4); \ \ - const avx::mask_type mask = ~avx::movemask_epi8(r); \ - if (WJR_UNLIKELY(mask != 0)) { \ - return (index) + 4 - clz(mask) / 8; \ - } \ - } while (false) -#else -#define WJR_REGISTER_REVERSE_FIND_NOT_N_4(index) \ - WJR_REGISTER_REVERSE_FIND_NOT_N_2((index) + 2); \ - WJR_REGISTER_REVERSE_FIND_NOT_N_2(index) -#endif + const avx::mask_type mask = ~avx::movemask_epi8(r); \ + if (WJR_UNLIKELY(mask != 0)) { \ + return (index) + 4 - clz(mask) / 8; \ + } \ + } while (false) + #else + #define WJR_REGISTER_REVERSE_FIND_NOT_N_4(index) \ + WJR_REGISTER_REVERSE_FIND_NOT_N_2((index) + 2); \ + WJR_REGISTER_REVERSE_FIND_NOT_N_2(index) + #endif -#define WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE(index) src += index + #define WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE(index) src += index -#define WJR_REGISTER_REVERSE_FIND_NOT_N_RET(index) 0 + #define WJR_REGISTER_REVERSE_FIND_NOT_N_RET(index) 0 
const auto y2 = sse::set1(val, T()); -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) const auto y4 = broadcast<__m128i_t, __m256i_t>(y2); -#endif + #endif WJR_REGISTER_X86_NORMAL_REVERSE_SIMD_FUNCTION( n, WJR_REGISTER_REVERSE_FIND_NOT_N_2, WJR_REGISTER_REVERSE_FIND_NOT_N_4, WJR_HAS_SIMD(AVX2), WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE, , WJR_REGISTER_REVERSE_FIND_NOT_N_RET); -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) do { const auto r0 = sse::cmpeq_epi64(sse::loadu(src - 8), y2); const auto r1 = sse::cmpeq_epi64(sse::loadu(src - 6), y2); @@ -467,7 +467,7 @@ WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src, T val, size_t n) src -= 8; n -= 8; } while (WJR_LIKELY(n != 0)); -#else + #else do { const auto r0 = avx::cmpeq_epi64(avx::loadu(src - 16), y4); const auto r1 = avx::cmpeq_epi64(avx::loadu(src - 12), y4); @@ -499,14 +499,14 @@ WJR_PURE size_t large_builtin_reverse_find_not_n(const T *src, T val, size_t n) src -= 16; n -= 16; } while (WJR_LIKELY(n != 0)); -#endif + #endif return 0; -#undef WJR_REGISTER_REVERSE_FIND_NOT_N_RET -#undef WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE -#undef WJR_REGISTER_REVERSE_FIND_NOT_N_4 -#undef WJR_REGISTER_REVERSE_FIND_NOT_N_2 + #undef WJR_REGISTER_REVERSE_FIND_NOT_N_RET + #undef WJR_REGISTER_REVERSE_FIND_NOT_N_ADVANCE + #undef WJR_REGISTER_REVERSE_FIND_NOT_N_4 + #undef WJR_REGISTER_REVERSE_FIND_NOT_N_2 } extern template WJR_PURE size_t large_builtin_reverse_find_not_n( @@ -516,4 +516,4 @@ extern template WJR_PURE size_t large_builtin_reverse_find_not_n( } // namespace wjr -#endif // WJR_X86_MATH_LARGE_FIND_IMPL_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_LARGE_FIND_IMPL_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/mul-impl.hpp b/include/wjr/arch/x86/math/mul-impl.hpp index 8c134c59..757eaf2b 100644 --- a/include/wjr/arch/x86/math/mul-impl.hpp +++ b/include/wjr/arch/x86/math/mul-impl.hpp @@ -1,10 +1,10 @@ -#ifndef WJR_X86_MATH_MUL_IMPL_HPP__ -#define 
WJR_X86_MATH_MUL_IMPL_HPP__ +#ifndef WJR_ARCH_X86_MATH_MUL_IMPL_HPP__ +#define WJR_ARCH_X86_MATH_MUL_IMPL_HPP__ #include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif namespace wjr { @@ -12,73 +12,73 @@ namespace wjr { #define WJR_HAS_BUILTIN_UMUL128 WJR_HAS_DEF #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_UMUL128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN_ASM_UMUL128 WJR_HAS_DEF #elif WJR_HAS_FEATURE(INT128) -#define WJR_HAS_BUILTIN_INT128_UMUL128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN_INT128_UMUL128 WJR_HAS_DEF #elif defined(WJR_MSVC) -#define WJR_HAS_BUILTIN_MSVC_UMUL128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN_MSVC_UMUL128 WJR_HAS_DEF #else -#undef WJR_HAS_BUILTIN_UMUL128 + #undef WJR_HAS_BUILTIN_UMUL128 #endif #if defined(__BMI2__) -#define WJR_HAS_BUILTIN_MULX_U64 WJR_HAS_DEF + #define WJR_HAS_BUILTIN_MULX_U64 WJR_HAS_DEF #endif #if defined(__BMI2__) -#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_MUL_1 WJR_HAS_DEF -#elif defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_MUL_1 WJR_HAS_DEF_VAR(3) -#endif + #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) + #define WJR_HAS_BUILTIN_ASM_MUL_1 WJR_HAS_DEF + #elif defined(WJR_ENABLE_ASSEMBLY) + #define WJR_HAS_BUILTIN_ASM_MUL_1 WJR_HAS_DEF_VAR(3) + #endif #endif #if defined(__BMI2__) && defined(__ADX__) -#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_ADDMUL_1 WJR_HAS_DEF -#elif defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_ADDMUL_1 WJR_HAS_DEF_VAR(3) -#endif + #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) + #define WJR_HAS_BUILTIN_ASM_ADDMUL_1 WJR_HAS_DEF + #elif defined(WJR_ENABLE_ASSEMBLY) + #define WJR_HAS_BUILTIN_ASM_ADDMUL_1 WJR_HAS_DEF_VAR(3) + #endif #endif #if defined(__BMI2__) && defined(__ADX__) -#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_SUBMUL_1 WJR_HAS_DEF -#elif defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_SUBMUL_1 WJR_HAS_DEF_VAR(3) -#endif + #if 
WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) + #define WJR_HAS_BUILTIN_ASM_SUBMUL_1 WJR_HAS_DEF + #elif defined(WJR_ENABLE_ASSEMBLY) + #define WJR_HAS_BUILTIN_ASM_SUBMUL_1 WJR_HAS_DEF_VAR(3) + #endif #endif #if defined(__BMI2__) -#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_ADDLSH_N WJR_HAS_DEF -#define WJR_HAS_BUILTIN_ASM_RSBLSH_N WJR_HAS_DEF -#elif defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_ADDLSH_N WJR_HAS_DEF_VAR(3) -#define WJR_HAS_BUILTIN_ASM_RSBLSH_N WJR_HAS_DEF_VAR(3) -#endif + #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) + #define WJR_HAS_BUILTIN_ASM_ADDLSH_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_ASM_RSBLSH_N WJR_HAS_DEF + #elif defined(WJR_ENABLE_ASSEMBLY) + #define WJR_HAS_BUILTIN_ASM_ADDLSH_N WJR_HAS_DEF_VAR(3) + #define WJR_HAS_BUILTIN_ASM_RSBLSH_N WJR_HAS_DEF_VAR(3) + #endif #endif #if defined(__BMI2__) && defined(__ADX__) -#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF -#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF -#elif defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF_VAR(3) -#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF_VAR(3) -#endif + #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) + #define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF + #define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF + #elif defined(WJR_ENABLE_ASSEMBLY) + #define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF_VAR(3) + #define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF_VAR(3) + #endif #endif } // namespace wjr -#endif // WJR_X86_MATH_MUL_IMPL_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_MUL_IMPL_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/mul.hpp b/include/wjr/arch/x86/math/mul.hpp index 33ad8e64..106eca96 100644 --- a/include/wjr/arch/x86/math/mul.hpp +++ b/include/wjr/arch/x86/math/mul.hpp @@ -1,11 +1,11 @@ -#ifndef WJR_X86_MATH_MUL_HPP__ -#define WJR_X86_MATH_MUL_HPP__ +#ifndef 
WJR_ARCH_X86_MATH_MUL_HPP__ +#define WJR_ARCH_X86_MATH_MUL_HPP__ -#include #include +#include #if WJR_HAS_BUILTIN(MSVC_UMUL128) -#include + #include #endif namespace wjr { @@ -14,30 +14,30 @@ namespace wjr { WJR_INTRINSIC_INLINE uint64_t builtin_umul128(uint64_t a, uint64_t b, uint64_t &hi) noexcept { -#if WJR_HAS_BUILTIN(ASM_UMUL128) + #if WJR_HAS_BUILTIN(ASM_UMUL128) uint64_t lo; asm("mul{q %3| %3}\n\t" : "=a,a"(lo), "=d,d"(hi) : "%a,r"(a), "r,a"(b) : "cc"); return lo; -#elif WJR_HAS_BUILTIN(INT128_UMUL128) + #elif WJR_HAS_BUILTIN(INT128_UMUL128) const __uint128_t x = static_cast<__uint128_t>(a) * b; hi = x >> 64; return static_cast(x); -#else + #else return _umul128(a, b, &hi); -#endif + #endif } #endif #if WJR_HAS_BUILTIN(ASM_MUL_1) -#if WJR_HAS_BUILTIN(ASM_MUL_1) == 1 + #if WJR_HAS_BUILTIN(ASM_MUL_1) == 1 extern uint64_t __wjr_asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; -#else + #else extern "C" WJR_MS_ABI uint64_t __wjr_asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; -#endif + #endif WJR_INTRINSIC_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept { @@ -48,13 +48,13 @@ WJR_INTRINSIC_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size #if WJR_HAS_BUILTIN(ASM_ADDMUL_1) -#if WJR_HAS_BUILTIN(ASM_ADDMUL_1) == 1 + #if WJR_HAS_BUILTIN(ASM_ADDMUL_1) == 1 extern uint64_t __wjr_asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; -#else + #else extern "C" WJR_MS_ABI uint64_t __wjr_asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; -#endif + #endif WJR_INTRINSIC_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept { @@ -65,14 +65,14 @@ WJR_INTRINSIC_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, s #if WJR_HAS_BUILTIN(ASM_SUBMUL_1) -#if WJR_HAS_BUILTIN(ASM_SUBMUL_1) == 1 + #if WJR_HAS_BUILTIN(ASM_SUBMUL_1) == 1 // slower 
than asm_addmul_1 extern uint64_t __wjr_asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; -#else + #else extern "C" WJR_MS_ABI uint64_t __wjr_asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; -#endif + #endif WJR_INTRINSIC_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept { @@ -82,26 +82,26 @@ WJR_INTRINSIC_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, s #endif #if WJR_HAS_BUILTIN(ASM_ADDLSH_N) -#define WJR_ADDSUB_I 1 -#include + #define WJR_ADDSUB_I 1 + #include #endif #if WJR_HAS_BUILTIN(ASM_RSBLSH_N) -#define WJR_ADDSUB_I 0 -#include + #define WJR_ADDSUB_I 0 + #include #endif #if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) -#if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) == 1 + #if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) == 1 extern void __wjr_asm_basecase_mul_s_impl(uint64_t *dst, const uint64_t *src0, size_t rdx, const uint64_t *src1, size_t m) noexcept; -#else + #else extern "C" WJR_MS_ABI void __wjr_asm_basecase_mul_s_impl(uint64_t *dst, const uint64_t *src0, size_t rdx, const uint64_t *src1, size_t m) noexcept; -#endif + #endif inline void asm_basecase_mul_s(uint64_t *dst, const uint64_t *src0, size_t n, const uint64_t *src1, size_t m) noexcept { @@ -114,13 +114,13 @@ inline void asm_basecase_mul_s(uint64_t *dst, const uint64_t *src0, size_t n, #if WJR_HAS_BUILTIN(ASM_BASECASE_SQR) -#if WJR_HAS_BUILTIN(ASM_BASECASE_SQR) == 1 + #if WJR_HAS_BUILTIN(ASM_BASECASE_SQR) == 1 extern void __wjr_asm_basecase_sqr_impl(uint64_t *dst, const uint64_t *src, size_t rdx) noexcept; -#else + #else extern "C" WJR_MS_ABI void __wjr_asm_basecase_sqr_impl(uint64_t *dst, const uint64_t *src, size_t rdx) noexcept; -#endif + #endif inline void asm_basecase_sqr(uint64_t *dst, const uint64_t *src, size_t n) noexcept { WJR_ASSERT(n >= 1); @@ -131,4 +131,4 @@ inline void asm_basecase_sqr(uint64_t *dst, const uint64_t *src, size_t n) noexc } // namespace wjr -#endif // 
WJR_X86_MATH_MUL_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_MUL_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/not.hpp b/include/wjr/arch/x86/math/not.hpp index 3253afad..e323b080 100644 --- a/include/wjr/arch/x86/math/not.hpp +++ b/include/wjr/arch/x86/math/not.hpp @@ -1,17 +1,17 @@ -#ifndef WJR_X86_MATH_NOT_HPP__ -#define WJR_X86_MATH_NOT_HPP__ +#ifndef WJR_ARCH_X86_MATH_NOT_HPP__ +#define WJR_ARCH_X86_MATH_NOT_HPP__ -#include #include +#include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif namespace wjr { #if WJR_HAS_SIMD(SSE2) -#define WJR_HAS_BUILTIN_COMPLEMENT_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_COMPLEMENT_N WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(COMPLEMENT_N) @@ -221,4 +221,4 @@ WJR_INTRINSIC_INLINE void builtin_not_n(T *dst, const T *src, size_t n) noexcept } // namespace wjr -#endif // WJR_X86_MATH_NOT_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_NOT_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/prefix_xor.hpp b/include/wjr/arch/x86/math/prefix_xor.hpp index 91119c7a..e7e384bd 100644 --- a/include/wjr/arch/x86/math/prefix_xor.hpp +++ b/include/wjr/arch/x86/math/prefix_xor.hpp @@ -1,13 +1,13 @@ -#ifndef WJR_X86_MATH_PREFIX_XOR_HPP__ -#define WJR_X86_MATH_PREFIX_XOR_HPP__ +#ifndef WJR_ARCH_X86_MATH_PREFIX_XOR_HPP__ +#define WJR_ARCH_X86_MATH_PREFIX_XOR_HPP__ -#include #include +#include namespace wjr { #if WJR_HAS_SIMD(PCLMUL) -#define WJR_HAS_BUILTIN_PREFIX_XOR WJR_HAS_DEF + #define WJR_HAS_BUILTIN_PREFIX_XOR WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(PREFIX_XOR) @@ -33,4 +33,4 @@ T builtin_prefix_xor(T x) noexcept { } // namespace wjr -#endif // WJR_X86_MATH_PREFIX_XOR_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_PREFIX_XOR_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/set.hpp b/include/wjr/arch/x86/math/set.hpp index 924dfbf0..66a678d8 100644 --- a/include/wjr/arch/x86/math/set.hpp +++ 
b/include/wjr/arch/x86/math/set.hpp @@ -1,18 +1,18 @@ -#ifndef WJR_X86_MATH_SET_HPP__ -#define WJR_X86_MATH_SET_HPP__ +#ifndef WJR_ARCH_X86_MATH_SET_HPP__ +#define WJR_ARCH_X86_MATH_SET_HPP__ #include #include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif namespace wjr { #if WJR_HAS_SIMD(SSE2) -#define WJR_HAS_BUILTIN_SET_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_SET_N WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(SET_N) @@ -173,14 +173,14 @@ WJR_INTRINSIC_INLINE void builtin_set_n(T *dst, T val, size_t n) noexcept { return; } -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) if constexpr (is_avx) { auto z = broadcast<__m128i_t, __m256i_t>(y); avx::storeu(dst, z); avx::storeu(dst + n - type_width, z); return; } -#endif + #endif WJR_UNREACHABLE(); } @@ -189,4 +189,4 @@ WJR_INTRINSIC_INLINE void builtin_set_n(T *dst, T val, size_t n) noexcept { } // namespace wjr -#endif // WJR_X86_MATH_SET_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_SET_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/shift.hpp b/include/wjr/arch/x86/math/shift.hpp index c585c717..2aea706f 100644 --- a/include/wjr/arch/x86/math/shift.hpp +++ b/include/wjr/arch/x86/math/shift.hpp @@ -1,10 +1,10 @@ -#ifndef WJR_X86_MATH_SHIFT_HPP__ -#define WJR_X86_MATH_SHIFT_HPP__ +#ifndef WJR_ARCH_X86_MATH_SHIFT_HPP__ +#define WJR_ARCH_X86_MATH_SHIFT_HPP__ #include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif namespace wjr { @@ -16,8 +16,8 @@ template WJR_INTRINSIC_CONSTEXPR20 T shrd(T lo, T hi, unsigned int c) noexcept; #if WJR_HAS_SIMD(SSE2) -#define WJR_HAS_BUILTIN_LSHIFT_N WJR_HAS_DEF -#define WJR_HAS_BUILTIN_RSHIFT_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_LSHIFT_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN_RSHIFT_N WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(LSHIFT_N) || WJR_HAS_BUILTIN(RSHIFT_N) @@ -56,21 +56,22 @@ WJR_INTRINSIC_INLINE __m128i __mm_srl_epi64(__m128i x, __m128i c) noexcept { #if WJR_HAS_BUILTIN(LSHIFT_N) -#define 
WJR_REGISTER_LSHIFT_N_IMPL_UNALIGNED(index) \ - do { \ - __m128i x1 = sse::loadu(src - 3 - (index)); \ - x0 = simd_cast<__m128_t, __m128i_t>(sse::template shuffle_ps<78>( \ - simd_cast<__m128i_t, __m128_t>(x1), simd_cast<__m128i_t, __m128_t>(x0))); \ + #define WJR_REGISTER_LSHIFT_N_IMPL_UNALIGNED(index) \ + do { \ + __m128i x1 = sse::loadu(src - 3 - (index)); \ + x0 = simd_cast<__m128_t, __m128i_t>( \ + sse::template shuffle_ps<78>(simd_cast<__m128i_t, __m128_t>(x1), \ + simd_cast<__m128i_t, __m128_t>(x0))); \ \ - __m128i r0 = __mm_sll_epi64(x0, y); \ - __m128i r1 = __mm_srl_epi64(x1, z); \ + __m128i r0 = __mm_sll_epi64(x0, y); \ + __m128i r1 = __mm_srl_epi64(x1, z); \ \ - __m128i r = sse::Or(r0, r1); \ + __m128i r = sse::Or(r0, r1); \ \ - sse::storeu(dst - 2 - (index), r); \ + sse::storeu(dst - 2 - (index), r); \ \ - x0 = x1; \ - } while (false) + x0 = x1; \ + } while (false) template void large_builtin_lshift_n_impl(T *dst, const T *src, size_t n, @@ -167,7 +168,7 @@ WJR_INTRINSIC_INLINE void builtin_lshift_n_impl(T *dst, const T *src, size_t n, return large_builtin_lshift_n_impl(dst, src, n, c); } -#undef WJR_REGISTER_LSHIFT_N_IMPL_UNALIGNED + #undef WJR_REGISTER_LSHIFT_N_IMPL_UNALIGNED template WJR_INTRINSIC_INLINE T builtin_lshift_n(T *dst, const T *src, size_t n, unsigned int c, @@ -182,21 +183,22 @@ WJR_INTRINSIC_INLINE T builtin_lshift_n(T *dst, const T *src, size_t n, unsigned #if WJR_HAS_BUILTIN(RSHIFT_N) -#define WJR_REGISTER_RSHIFT_N_IMPL_UNALIGNED(index) \ - do { \ - __m128i x1 = sse::loadu(src + 1 + (index)); \ - x0 = simd_cast<__m128_t, __m128i_t>(sse::template shuffle_ps<78>( \ - simd_cast<__m128i_t, __m128_t>(x0), simd_cast<__m128i_t, __m128_t>(x1))); \ + #define WJR_REGISTER_RSHIFT_N_IMPL_UNALIGNED(index) \ + do { \ + __m128i x1 = sse::loadu(src + 1 + (index)); \ + x0 = simd_cast<__m128_t, __m128i_t>( \ + sse::template shuffle_ps<78>(simd_cast<__m128i_t, __m128_t>(x0), \ + simd_cast<__m128i_t, __m128_t>(x1))); \ \ - __m128i r0 = __mm_srl_epi64(x0, 
y); \ - __m128i r1 = __mm_sll_epi64(x1, z); \ + __m128i r0 = __mm_srl_epi64(x0, y); \ + __m128i r1 = __mm_sll_epi64(x1, z); \ \ - __m128i r = sse::Or(r0, r1); \ + __m128i r = sse::Or(r0, r1); \ \ - sse::storeu(dst + (index), r); \ + sse::storeu(dst + (index), r); \ \ - x0 = x1; \ - } while (false) + x0 = x1; \ + } while (false) template void large_builtin_rshift_n_impl(T *dst, const T *src, size_t n, @@ -293,7 +295,7 @@ WJR_INTRINSIC_INLINE void builtin_rshift_n_impl(T *dst, const T *src, size_t n, return large_builtin_rshift_n_impl(dst, src, n, c); } -#undef WJR_REGISTER_RSHIFT_N_IMPL_UNALIGNED + #undef WJR_REGISTER_RSHIFT_N_IMPL_UNALIGNED template WJR_INTRINSIC_INLINE T builtin_rshift_n(T *dst, const T *src, size_t n, unsigned int c, @@ -308,4 +310,4 @@ WJR_INTRINSIC_INLINE T builtin_rshift_n(T *dst, const T *src, size_t n, unsigned } // namespace wjr -#endif // WJR_X86_MATH_SHIFT_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_MATH_SHIFT_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/math/sub.hpp b/include/wjr/arch/x86/math/sub.hpp index f7bd5cdd..dfc49c4e 100644 --- a/include/wjr/arch/x86/math/sub.hpp +++ b/include/wjr/arch/x86/math/sub.hpp @@ -1,36 +1,36 @@ -#ifndef WJR_X86_SUB_HPP__ -#define WJR_X86_SUB_HPP__ +#ifndef WJR_ARCH_X86_SUB_HPP__ +#define WJR_ARCH_X86_SUB_HPP__ #include #ifndef WJR_X86 -#error "x86 required" + #error "x86 required" #endif #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_SUBC WJR_HAS_DEF -#define WJR_HAS_BUILTIN_ASM_SUBC_N WJR_HAS_DEF -#define WJR_HAS_BUILTIN___ASM_SUB_128 WJR_HAS_DEF -#define WJR_HAS_BUILTIN___ASM_SUBC_128 WJR_HAS_DEF - -#if WJR_HAS_FEATURE(INLINE_ASM_CCCOND) -#define WJR_HAS_BUILTIN_ASM_SUBC_CC WJR_HAS_DEF -#define WJR_HAS_BUILTIN___ASM_SUBC_CC_128 WJR_HAS_DEF -#endif + #define WJR_HAS_BUILTIN_ASM_SUBC WJR_HAS_DEF + #define WJR_HAS_BUILTIN_ASM_SUBC_N WJR_HAS_DEF + #define WJR_HAS_BUILTIN___ASM_SUB_128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN___ASM_SUBC_128 
WJR_HAS_DEF + + #if WJR_HAS_FEATURE(INLINE_ASM_CCCOND) + #define WJR_HAS_BUILTIN_ASM_SUBC_CC WJR_HAS_DEF + #define WJR_HAS_BUILTIN___ASM_SUBC_CC_128 WJR_HAS_DEF + #endif #else -#if defined(WJR_MSVC) -#define WJR_HAS_BUILTIN_ASM_SUBC WJR_HAS_DEF_VAR(2) -#endif + #if defined(WJR_MSVC) + #define WJR_HAS_BUILTIN_ASM_SUBC WJR_HAS_DEF_VAR(2) + #endif -#if defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_SUBC_N WJR_HAS_DEF_VAR(3) -#endif + #if defined(WJR_ENABLE_ASSEMBLY) + #define WJR_HAS_BUILTIN_ASM_SUBC_N WJR_HAS_DEF_VAR(3) + #endif #endif #if WJR_HAS_BUILTIN(ASM_SUBC) == 2 -#include + #include #endif namespace wjr { @@ -40,7 +40,7 @@ namespace wjr { template WJR_INTRINSIC_INLINE uint64_t asm_subc(uint64_t a, uint64_t b, U c_in, U &c_out) noexcept { -#if WJR_HAS_BUILTIN(ASM_SUBC) == 1 + #if WJR_HAS_BUILTIN(ASM_SUBC) == 1 if (WJR_BUILTIN_CONSTANT_P_TRUE(c_in == 1)) { if (WJR_BUILTIN_CONSTANT_P(b) && in_range(b)) { asm("stc\n\t" @@ -78,11 +78,11 @@ WJR_INTRINSIC_INLINE uint64_t asm_subc(uint64_t a, uint64_t b, U c_in, } c_out = c_in; return a; -#else + #else uint64_t ret; c_out = fast_cast(_subborrow_u64(fast_cast(c_in), a, b, &ret)); return ret; -#endif + #endif } #endif @@ -127,8 +127,8 @@ WJR_INTRINSIC_INLINE uint64_t asm_subc_cc(uint64_t a, uint64_t b, uint8_t c_in, #endif #if WJR_HAS_BUILTIN(ASM_SUBC_N) -#define WJR_ADDSUB_I 0 -#include + #define WJR_ADDSUB_I 0 + #include #endif #if WJR_HAS_BUILTIN(__ASM_SUB_128) @@ -260,4 +260,4 @@ WJR_INTRINSIC_INLINE uint8_t __asm_subc_cc_128(uint64_t &al, uint64_t &ah, uint6 } // namespace wjr -#endif // WJR_X86_SUB_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_SUB_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/simd/avx.hpp b/include/wjr/arch/x86/simd/avx.hpp index 1571d41b..238982d7 100644 --- a/include/wjr/arch/x86/simd/avx.hpp +++ b/include/wjr/arch/x86/simd/avx.hpp @@ -1,5 +1,5 @@ -#ifndef WJR_X86_SIMD_AVX_HPP__ -#define WJR_X86_SIMD_AVX_HPP__ +#ifndef WJR_ARCH_X86_SIMD_AVX_HPP__ 
+#define WJR_ARCH_X86_SIMD_AVX_HPP__ #include @@ -679,11 +679,11 @@ struct broadcast_fn<__m256i_t, __m256i_t> { template <> struct broadcast_fn<__m128i_t, __m256i_t> { WJR_CONST WJR_INTRINSIC_INLINE __m256i operator()(__m128i v) const { -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) return _mm256_broadcastsi128_si256(v); -#else + #else return _mm256_insertf128_si256(_mm256_castsi128_si256(v), v, 1); -#endif + #endif } }; @@ -723,11 +723,11 @@ int64_t avx::extract(__m256i v, int64_t) { template __m128i avx::extract_si128(__m256i v) { -#if WJR_HAS_SIMD(AV2) + #if WJR_HAS_SIMD(AV2) return _mm256_extracti128_si256(v, imm8); -#else + #else return _mm256_extractf128_si256(v, imm8); -#endif + #endif } __m128i avx::getlow(__m256i a) { return simd_cast<__m256i_t, __m128i_t>(a); } @@ -756,11 +756,11 @@ __m256i avx::insert_epi64(__m256i v, int64_t i) { template __m256i avx::insert_si128(__m256i a, __m128i b) { -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) return _mm256_inserti128_si256(a, b, imm8); -#else + #else return _mm256_insertf128_si256(a, b, imm8); -#endif + #endif } __m256i avx::load(const void *p) { @@ -1654,4 +1654,4 @@ __m256i avx::unpacklo(__m256i a, __m256i b, uint32_t) { return unpacklo_epi32(a, } // namespace wjr -#endif // WJR_X86_SIMD_AVX_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_SIMD_AVX_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/simd/intrin.hpp b/include/wjr/arch/x86/simd/intrin.hpp index 6f910df4..fd3626fe 100644 --- a/include/wjr/arch/x86/simd/intrin.hpp +++ b/include/wjr/arch/x86/simd/intrin.hpp @@ -1,14 +1,14 @@ -#ifndef WJR_X86_SIMD_INTRIN_HPP__ -#define WJR_X86_SIMD_INTRIN_HPP__ +#ifndef WJR_ARCH_X86_SIMD_INTRIN_HPP__ +#define WJR_ARCH_X86_SIMD_INTRIN_HPP__ #include #if defined(_MSC_VER) -/* Microsoft C/C++-compatible compiler */ -#include + /* Microsoft C/C++-compatible compiler */ + #include #elif defined(__GNUC__) -/* GCC-compatible compiler, targeting x86/x86-64 */ -#include + /* GCC-compatible 
compiler, targeting x86/x86-64 */ + #include #endif -#endif // WJR_X86_SIMD_INTRIN_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_SIMD_INTRIN_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/simd/simd.hpp b/include/wjr/arch/x86/simd/simd.hpp index 91356e8a..885796d4 100644 --- a/include/wjr/arch/x86/simd/simd.hpp +++ b/include/wjr/arch/x86/simd/simd.hpp @@ -1,5 +1,5 @@ -#ifndef WJR_X86_SIMD_SIMD_HPP__ -#define WJR_X86_SIMD_SIMD_HPP__ +#ifndef WJR_ARCH_X86_SIMD_SIMD_HPP__ +#define WJR_ARCH_X86_SIMD_SIMD_HPP__ #include @@ -178,7 +178,7 @@ class __x86_simd_base { }; #if WJR_HAS_SIMD(SSE2) -#define WJR_HAS_SIMD_NATIVE_128BIT WJR_HAS_DEF + #define WJR_HAS_SIMD_NATIVE_128BIT WJR_HAS_DEF template <> class simd> : public __x86_simd_base { @@ -215,7 +215,7 @@ class simd> : public __x86_simd_base class simd> : public __x86_simd_base { @@ -254,4 +254,4 @@ class simd> : public __x86_simd_base -#include #include +#include namespace wjr { @@ -377,4 +377,4 @@ struct simd_cast_fn<__m256i_t, uint64_t> { } // namespace wjr -#endif // WJR_X86_SIMD_SIMD_CAST_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_SIMD_SIMD_CAST_HPP__ \ No newline at end of file diff --git a/include/wjr/arch/x86/simd/sse.hpp b/include/wjr/arch/x86/simd/sse.hpp index dbcf6003..d8207ac4 100644 --- a/include/wjr/arch/x86/simd/sse.hpp +++ b/include/wjr/arch/x86/simd/sse.hpp @@ -1,5 +1,5 @@ -#ifndef WJR_X86_SIMD_SSE_HPP__ -#define WJR_X86_SIMD_SSE_HPP__ +#ifndef WJR_ARCH_X86_SIMD_SSE_HPP__ +#define WJR_ARCH_X86_SIMD_SSE_HPP__ #include @@ -852,9 +852,9 @@ __m128i sse::adds(__m128i a, __m128i b, uint16_t) { return adds_epu16(a, b); } template __m128i sse::alignr(__m128i a, __m128i b) { constexpr int s = imm8 & 0x1F; -#if WJR_HAS_SIMD(SSSE3) + #if WJR_HAS_SIMD(SSSE3) return _mm_alignr_epi8(a, b, s); -#else + #else if constexpr (s == 0) { return b; } @@ -865,7 +865,7 @@ __m128i sse::alignr(__m128i a, __m128i b) { return Or(slli<16 - s>(a), srli(b)); } return srli(a); -#endif // 
SSSE3 + #endif // SSSE3 } __m128i sse::alignr_epi16(__m128i a, __m128i b, int c) { @@ -913,13 +913,13 @@ __m128i sse::avg(__m128i a, __m128i b, uint16_t) { return avg_epu16(a, b); } // notice that mask must be 0 or 255(every byte) __m128i sse::blendv_epi8(__m128i a, __m128i b, __m128i mask) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_blendv_epi8(a, b, mask); -#elif defined(WJR_COMPILER_GCC) + #elif defined(WJR_COMPILER_GCC) return ((~mask) & a) | (mask & b); -#else + #else return Or(AndNot(mask, a), And(mask, b)); -#endif + #endif } __m128i sse::blendv_epi16(__m128i a, __m128i b, __m128i mask) { @@ -976,59 +976,59 @@ __m128i sse::cmpeq(__m128i a, __m128i b, uint16_t) { return cmpeq_epi16(a, b); } __m128i sse::cmpeq(__m128i a, __m128i b, uint32_t) { return cmpeq_epi32(a, b); } __m128i sse::cmpge_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comge_epi8(a, b); -#elif WJR_HAS_SIMD(SSE4_1) + #elif WJR_HAS_SIMD(SSE4_1) return cmpeq(min(a, b, int8_t()), b, uint8_t()); -#else + #else return Not(cmpgt(b, a, int8_t())); -#endif + #endif } __m128i sse::cmpge_epi16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comge_epi16(a, b); -#else + #else return cmpeq(min(a, b, int16_t()), b, uint16_t()); -#endif + #endif } __m128i sse::cmpge_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comge_epi32(a, b); -#elif WJR_HAS_SIMD(SSE4_1) + #elif WJR_HAS_SIMD(SSE4_1) return cmpeq(min(a, b, int32_t()), b, uint32_t()); -#else + #else return Not(cmpgt(b, a, int32_t())); -#endif + #endif } __m128i sse::cmpge_epu8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comge_epu8(a, b); -#else + #else return cmpeq(min(a, b, uint8_t()), b, uint8_t()); -#endif + #endif } __m128i sse::cmpge_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comge_epu16(a, b); -#elif WJR_HAS_SIMD(SSE4_1) + #elif 
WJR_HAS_SIMD(SSE4_1) return cmpeq(min(a, b, uint16_t()), b, uint16_t()); -#else + #else return logical_not(subs(b, a, uint16_t()), uint16_t()); -#endif + #endif } __m128i sse::cmpge_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comge_epu32(a, b); -#elif WJR_HAS_SIMD(SSE4_1) + #elif WJR_HAS_SIMD(SSE4_1) return cmpeq(min(a, b, uint32_t()), b, uint32_t()); -#else + #else return Not(cmpgt(b, a, uint32_t())); -#endif + #endif } __m128i sse::cmpge(__m128i a, __m128i b, int8_t) { return cmpge_epi8(a, b); } @@ -1043,27 +1043,27 @@ __m128i sse::cmpgt_epi16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(a, b); } __m128i sse::cmpgt_epi32(__m128i a, __m128i b) { return _mm_cmpgt_epi32(a, b); } __m128i sse::cmpgt_epu8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comgt_epu8(a, b); -#else + #else return cmpgt_epi8(Xor(a, setmin_epi8()), Xor(b, setmin_epi8())); -#endif + #endif } __m128i sse::cmpgt_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comgt_epu16(a, b); -#else + #else return cmpgt_epi16(Xor(a, setmin_epi16()), Xor(b, setmin_epi16())); -#endif + #endif } __m128i sse::cmpgt_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comgt_epu32(a, b); -#else + #else return cmpgt_epi32(Xor(a, setmin_epi32()), Xor(b, setmin_epi32())); -#endif + #endif } __m128i sse::cmpgt(__m128i a, __m128i b, int8_t) { return cmpgt_epi8(a, b); } @@ -1104,27 +1104,27 @@ __m128i sse::cmplt(__m128i a, __m128i b, uint16_t) { return cmplt_epu16(a, b); } __m128i sse::cmplt(__m128i a, __m128i b, uint32_t) { return cmplt_epu32(a, b); } __m128i sse::cmpne_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comneq_epi8(a, b); -#else + #else return Not(cmpeq_epi8(a, b)); -#endif + #endif } __m128i sse::cmpne_epi16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comneq_epi16(a, b); -#else 
+ #else return Not(cmpeq_epi16(a, b)); -#endif + #endif } __m128i sse::cmpne_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(XOP) + #if WJR_HAS_SIMD(XOP) return _mm_comneq_epi32(a, b); -#else + #else return Not(cmpeq_epi32(a, b)); -#endif + #endif } __m128i sse::cmpne(__m128i a, __m128i b, int8_t) { return cmpne_epi8(a, b); } @@ -1169,15 +1169,15 @@ __m128i sse::concat(uint64_t lo, uint64_t hi) { return set_epi64x(hi, lo); } template int sse::extract_epi8(__m128i a) { static_assert(imm8 >= 0 && imm8 < 16, "imm8 must be in range [0, 15]"); -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_extract_epi8(a, imm8); -#else + #else if constexpr (imm8 & 1) { return extract_epi16<(imm8 >> 1)>(a) >> 8; } else { return extract_epi16<(imm8 >> 1)>(a) & 0xff; } -#endif + #endif } template @@ -1189,9 +1189,9 @@ int sse::extract_epi16(__m128i a) { template int sse::extract_epi32(__m128i a) { static_assert(imm8 >= 0 && imm8 < 4, "imm8 must be in range [0, 3]"); -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_extract_epi32(a, imm8); -#else + #else if constexpr (imm8 == 0) { return simd_cast<__m128i_t, uint32_t>(a); } else if constexpr (imm8 == 1) { @@ -1201,21 +1201,21 @@ int sse::extract_epi32(__m128i a) { } else { return simd_cast<__m128i_t, uint32_t>(shuffle_epi32<_MM_SHUFFLE(3, 3, 3, 3)>(a)); } -#endif + #endif } template int64_t sse::extract_epi64(__m128i a) { static_assert(imm8 >= 0 && imm8 < 2, "imm8 must be in range [0, 1]"); -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_extract_epi64(a, imm8); -#else + #else if constexpr (imm8 == 0) { return simd_cast<__m128i_t, uint64_t>(a); } else { return simd_cast<__m128i_t, uint64_t>(shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); } -#endif + #endif } template @@ -1322,39 +1322,39 @@ void sse::maskmoveu(__m128i a, __m128i mask, char *mem_addr) { } __m128i sse::max_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_max_epi8(a, b); -#else + #else return 
blendv_epi8(b, a, cmpgt_epi8(a, b)); -#endif + #endif } __m128i sse::max_epi16(__m128i a, __m128i b) { return _mm_max_epi16(a, b); } __m128i sse::max_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_max_epi32(a, b); -#else + #else return blendv_epi8(b, a, cmpgt_epi32(a, b)); -#endif + #endif } __m128i sse::max_epu8(__m128i a, __m128i b) { return _mm_max_epu8(a, b); } __m128i sse::max_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_max_epu16(a, b); -#else + #else return add(subs_epu16(b, a), a, uint16_t()); -#endif + #endif } __m128i sse::max_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_max_epu32(a, b); -#else + #else return blendv_epi8(b, a, cmpgt_epu32(a, b)); -#endif + #endif } __m128i sse::max(__m128i a, __m128i b, int8_t) { return max_epi8(a, b); } @@ -1367,14 +1367,14 @@ __m128i sse::max(__m128i a, __m128i b, uint32_t) { return max_epu32(a, b); } int8_t sse::max_epi8(__m128i a) { return 0x7fu ^ min_epu8(Xor(a, set1_epi8(0x7fu))); } int16_t sse::max_epi16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return 0x7fffu ^ min_epu16(Xor(a, set1_epi16(0x7fffu))); -#else + #else a = max_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); a = max_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); return simd_cast<__m128i_t, int16_t>(a); -#endif + #endif } int32_t sse::max_epi32(__m128i a) { @@ -1384,26 +1384,26 @@ int32_t sse::max_epi32(__m128i a) { } uint8_t sse::max_epu8(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return 0xffu ^ min_epu8(Xor(a, ones())); -#else + #else a = max_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); a = max_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); auto X = simd_cast<__m128i_t, uint32_t>(a); return std::max((uint8_t)X, 
(uint8_t)(X >> 8)); -#endif + #endif } uint16_t sse::max_epu16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return 0xffffu ^ min_epu16(Xor(a, ones())); -#else + #else a = max_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); a = max_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); return simd_cast<__m128i_t, uint16_t>(a); -#endif + #endif } uint32_t sse::max_epu32(__m128i a) { @@ -1422,39 +1422,39 @@ uint32_t sse::max(__m128i a, uint32_t) { return max_epu32(a); } void sse::mfence() { _mm_mfence(); } __m128i sse::min_epi8(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_min_epi8(a, b); -#else + #else return blendv_epi8(a, b, cmpgt_epi8(a, b)); -#endif + #endif } __m128i sse::min_epi16(__m128i a, __m128i b) { return _mm_min_epi16(a, b); } __m128i sse::min_epi32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_min_epi32(a, b); -#else + #else return blendv_epi8(a, b, cmpgt_epi32(a, b)); -#endif + #endif } __m128i sse::min_epu8(__m128i a, __m128i b) { return _mm_min_epu8(a, b); } __m128i sse::min_epu16(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_min_epu16(a, b); -#else + #else return blendv_epi8(a, b, cmpgt_epu16(a, b)); -#endif + #endif } __m128i sse::min_epu32(__m128i a, __m128i b) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return _mm_min_epu32(a, b); -#else + #else return blendv_epi8(a, b, cmpgt_epu32(a, b)); -#endif + #endif } __m128i sse::min(__m128i a, __m128i b, int8_t) { return min_epi8(a, b); } @@ -1467,14 +1467,14 @@ __m128i sse::min(__m128i a, __m128i b, uint32_t) { return min_epu32(a, b); } int8_t sse::min_epi8(__m128i a) { return 0x80u ^ min_epu8(Xor(a, setmin_epi8())); } int16_t sse::min_epi16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return 0x8000u ^ min_epu16(Xor(a, setmin_epi16())); -#else + #else a = 
min_epi16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); a = min_epi16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); return simd_cast<__m128i_t, int16_t>(a); -#endif + #endif } int32_t sse::min_epi32(__m128i a) { @@ -1484,28 +1484,28 @@ int32_t sse::min_epi32(__m128i a) { } uint8_t sse::min_epu8(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) a = min_epu8(a, srli_epi16(a, 8)); a = _mm_minpos_epu16(a); return simd_cast<__m128i_t, uint8_t>(a); -#else + #else a = min_epu8(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); a = min_epu8(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); auto X = simd_cast<__m128i_t, uint32_t>(a); return std::min((uint8_t)X, (uint8_t)(X >> 8)); -#endif + #endif } uint16_t sse::min_epu16(__m128i a) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return simd_cast<__m128i_t, uint16_t>(_mm_minpos_epu16(a)); -#else + #else a = min_epu16(a, shuffle_epi32<_MM_SHUFFLE(3, 2, 3, 2)>(a)); a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 3, 2)>(a)); a = min_epu16(a, shufflelo_epi16<_MM_SHUFFLE(1, 0, 1, 0)>(a)); return simd_cast<__m128i_t, uint16_t>(a); -#endif + #endif } uint32_t sse::min_epu32(__m128i a) { @@ -1550,27 +1550,27 @@ __m128i sse::mulhi_epu16(__m128i a, __m128i b) { return _mm_mulhi_epu16(a, b); } __m128i sse::mullo_epi16(__m128i a, __m128i b) { return _mm_mullo_epi16(a, b); } __m128i sse::negate_epi8(__m128i a) { -#if WJR_HAS_SIMD(SSSE3) + #if WJR_HAS_SIMD(SSSE3) return sign_epi8(a, ones()); -#else + #else return sub_epi8(zeros(), a); -#endif + #endif } __m128i sse::negate_epi16(__m128i a) { -#if WJR_HAS_SIMD(SSSE3) + #if WJR_HAS_SIMD(SSSE3) return sign_epi16(a, ones()); -#else + #else return sub_epi16(zeros(), a); -#endif + #endif } __m128i sse::negate_epi32(__m128i a) { -#if WJR_HAS_SIMD(SSSE3) + #if WJR_HAS_SIMD(SSSE3) return sign_epi32(a, ones()); -#else + #else return 
sub_epi32(zeros(), a); -#endif + #endif } __m128i sse::negate_epi64(__m128i a) { return sub_epi64(zeros(), a); } @@ -1602,11 +1602,11 @@ __m128i sse::loadu_si80(const void *ptr) { } __m128i sse::loadu_si96(const void *ptr) { -#if WJR_HAS_SIMD(SSE4_1) + #if WJR_HAS_SIMD(SSE4_1) return insert_epi32<2>(loadu_si64(ptr), reinterpret_cast(ptr)[2]); -#else + #else return insert_epi16<5>(loadu_si80(ptr), reinterpret_cast(ptr)[5]); -#endif + #endif } __m128i sse::loadu_si112(const void *ptr) { @@ -2033,4 +2033,4 @@ int sse::testz(__m128i a, __m128i b) { return _mm_testz_si128(a, b); } } // namespace wjr -#endif // WJR_X86_SIMD_SSE_HPP__ \ No newline at end of file +#endif // WJR_ARCH_X86_SIMD_SSE_HPP__ \ No newline at end of file diff --git a/include/wjr/assert.hpp b/include/wjr/assert.hpp index 470548a6..6472e2ee 100644 --- a/include/wjr/assert.hpp +++ b/include/wjr/assert.hpp @@ -32,15 +32,15 @@ #include #ifndef WJR_DEBUG_LEVEL -#if defined(NDEBUG) -#define WJR_DEBUG_LEVEL 0 -#else -#define WJR_DEBUG_LEVEL 1 -#endif + #if defined(NDEBUG) + #define WJR_DEBUG_LEVEL 0 + #else + #define WJR_DEBUG_LEVEL 1 + #endif #endif #if WJR_DEBUG_LEVEL < 0 || WJR_DEBUG_LEVEL > 3 -#error "WJR_DEBUG_LEVEL must be 0 ~ 3" + #error "WJR_DEBUG_LEVEL must be 0 ~ 3" #endif namespace wjr { diff --git a/include/wjr/atomic.hpp b/include/wjr/atomic.hpp new file mode 100644 index 00000000..15cbda52 --- /dev/null +++ b/include/wjr/atomic.hpp @@ -0,0 +1,4 @@ +#ifndef WJR_ATOMIC_HPP__ +#define WJR_ATOMIC_HPP__ + +#endif // WJR_ATOMIC_HPP__ diff --git a/include/wjr/capture_leaf.hpp b/include/wjr/capture_leaf.hpp index 0725ce76..70ab2580 100644 --- a/include/wjr/capture_leaf.hpp +++ b/include/wjr/capture_leaf.hpp @@ -27,8 +27,7 @@ class capture_leaf : enable_special_members_of_args_base { : Mybase(enable_default_constructor), m_value() {} template )> - constexpr capture_leaf(Args &&...args) noexcept( - std::is_constructible_v) + constexpr capture_leaf(Args &&...args) noexcept(std::is_constructible_v) : 
Mybase(enable_default_constructor), m_value(std::forward(args)...) {} template )> diff --git a/include/wjr/concurrency/pause.hpp b/include/wjr/concurrency/pause.hpp new file mode 100644 index 00000000..8531cece --- /dev/null +++ b/include/wjr/concurrency/pause.hpp @@ -0,0 +1,48 @@ +#ifndef WJR_CONCURRENCY_PAUSE_HPP__ +#define WJR_CONCURRENCY_PAUSE_HPP__ + +#include + +namespace wjr { + +#if defined(_MSC_VER) + #if defined(_M_AMD64) || defined(_M_IX86) +extern "C" void _mm_pause(void); + #if defined(WJR_COMPILER_MSVC) + #pragma intrinsic(_mm_pause) + #endif + #elif defined(_M_ARM64) || defined(_M_ARM) +extern "C" void __yield(void); + #if defined(WJR_COMPILER_MSVC) + #pragma intrinsic(__yield) + #endif + #endif +#endif + +WJR_INTRINSIC_INLINE void pause() noexcept { +#if defined(_MSC_VER) + #if defined(_M_AMD64) || defined(_M_IX86) + _mm_pause(); + #elif defined(_M_ARM64) || defined(_M_ARM) + __yield(); + #endif +#elif defined(__GNUC__) + #if defined(__i386__) || defined(__x86_64__) + asm volatile("pause\n\t" : : : "memory"); + #elif (defined(__ARM_ARCH) && __ARM_ARCH >= 8) || defined(__ARM_ARCH_8A__) || \ + defined(__aarch64__) + asm volatile("yield\n\t" : : : "memory"); + #elif defined(__riscv) && __riscv_xlen == 64 + #if defined(__riscv_zihintpause) + asm volatile("pause\n\t" : : : "memory"); + #else + /* Encoding of the pause instruction */ + asm volatile(".4byte 0x100000F\n\t"); + #endif + #endif +#endif +} + +} // namespace wjr + +#endif // WJR_CONCURRENCY_PAUSE_HPP__ \ No newline at end of file diff --git a/include/wjr/network/timer.hpp b/include/wjr/concurrency/timer.hpp similarity index 97% rename from include/wjr/network/timer.hpp rename to include/wjr/concurrency/timer.hpp index 95996a0d..a1eb1bad 100644 --- a/include/wjr/network/timer.hpp +++ b/include/wjr/concurrency/timer.hpp @@ -4,13 +4,13 @@ * @brief only preview * @version 0.1 * @date 2024-07-14 - * + * * @copyright Copyright (c) 2024 - * + * */ -#ifndef WJR_NETWORK_TIMER_HPP__ -#define 
WJR_NETWORK_TIMER_HPP__ +#ifndef WJR_CONCURRENCY_TIMER_HPP__ +#define WJR_CONCURRENCY_TIMER_HPP__ /** * @file timing_wheels.hpp @@ -178,4 +178,4 @@ class timing_wheels { } // namespace wjr -#endif // WJR_NETWORK_TIMER_HPP__ \ No newline at end of file +#endif // WJR_CONCURRENCY_TIMER_HPP__ \ No newline at end of file diff --git a/include/wjr/container/generic/bplus_tree.hpp b/include/wjr/container/generic/bplus_tree.hpp index e1517eca..c6a390d0 100644 --- a/include/wjr/container/generic/bplus_tree.hpp +++ b/include/wjr/container/generic/bplus_tree.hpp @@ -10,7 +10,7 @@ * less than or equal to 16. \n * After improvement, the number of queries for the i-th query is * [1, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10], the average number of queries - * is 6.56 times. In fact, the probability of querying smaller nodes is slightly greater + * is 6.56 times. In fact, the probability of querying smaller nodes is sliFghtly greater * than that of larger nodes, so the actual number of queries will be less. 
If the * comparison operation of key_type is more complex, it is not recommended to use B+ tree, * because the number of queries of B+ tree will be more, thus offsetting the advantages @@ -30,7 +30,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/crtp/nonsendable.hpp b/include/wjr/crtp/nonsendable.hpp index e2101865..4fa66792 100644 --- a/include/wjr/crtp/nonsendable.hpp +++ b/include/wjr/crtp/nonsendable.hpp @@ -18,11 +18,11 @@ #include #if WJR_DEBUG_LEVEL > 2 -#define WJR_HAS_DEBUG_NONSENDABLE_CHECKER WJR_HAS_DEF + #define WJR_HAS_DEBUG_NONSENDABLE_CHECKER WJR_HAS_DEF #endif #if WJR_HAS_DEBUG(NONSENDABLE_CHECKER) -#include + #include #endif namespace wjr { diff --git a/include/wjr/format/charconv.hpp b/include/wjr/format/charconv.hpp index 11c035b4..95559269 100644 --- a/include/wjr/format/charconv.hpp +++ b/include/wjr/format/charconv.hpp @@ -13,7 +13,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/format/dragonbox.hpp b/include/wjr/format/dragonbox.hpp index 8a5754e6..42fb5470 100644 --- a/include/wjr/format/dragonbox.hpp +++ b/include/wjr/format/dragonbox.hpp @@ -24,7 +24,7 @@ // ODR-used static data declaration will be decorated with this macro. The users may // define this macro, before including the library headers, into whatever they want. 
#ifndef WJR_STATIC_DATA_SECTION -#define WJR_STATIC_DATA_SECTION + #define WJR_STATIC_DATA_SECTION #endif namespace wjr { @@ -59,6 +59,7 @@ struct ieee754_binary32 { static constexpr int decimal_significand_digits = 9; static constexpr int decimal_exponent_digits = 2; }; + struct ieee754_binary64 { static constexpr int total_bits = 64; static constexpr int significand_bits = 52; diff --git a/include/wjr/format/fastfloat.hpp b/include/wjr/format/fastfloat.hpp index 0b2b9335..e33058d8 100644 --- a/include/wjr/format/fastfloat.hpp +++ b/include/wjr/format/fastfloat.hpp @@ -1792,8 +1792,8 @@ inline adjusted_mantissa positive_digit_comp(biginteger &bigmant, // we then need to scale by `2^(f- e)`, and then the two significant digits // are of the same magnitude. template -inline adjusted_mantissa negative_digit_comp(biginteger &bigmant, adjusted_mantissa am, - int32_t exponent) noexcept { +adjusted_mantissa negative_digit_comp(biginteger &bigmant, adjusted_mantissa am, + int32_t exponent) noexcept { biginteger &real_digits = bigmant; int32_t real_exp = exponent; @@ -1950,24 +1950,24 @@ WJR_INTRINSIC_INLINE bool rounds_to_nearest() noexcept { // Note: This may fail to be accurate if fast-math has been // enabled, as rounding conventions may not apply. #ifdef WJR_COMPILER_MSVC -#pragma warning(push) + #pragma warning(push) // todo: is there a VS warning? 
// see // https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 #elif defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wfloat-equal" + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wfloat-equal" #elif defined(__GNUC__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wfloat-equal" + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wfloat-equal" #endif return (fmini + 1.0f == 1.0f - fmini); #ifdef WJR_COMPILER_MSVC -#pragma warning(pop) + #pragma warning(pop) #elif defined(__clang__) -#pragma clang diagnostic pop + #pragma clang diagnostic pop #elif defined(__GNUC__) -#pragma GCC diagnostic pop + #pragma GCC diagnostic pop #endif } diff --git a/include/wjr/format/utf8/utf8.hpp b/include/wjr/format/utf8/utf8.hpp index 225bce46..8bcc4dc9 100644 --- a/include/wjr/format/utf8/utf8.hpp +++ b/include/wjr/format/utf8/utf8.hpp @@ -6,7 +6,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr::utf8 { diff --git a/include/wjr/iterator/detail.hpp b/include/wjr/iterator/detail.hpp index 3a3b3e52..b15bfc40 100644 --- a/include/wjr/iterator/detail.hpp +++ b/include/wjr/iterator/detail.hpp @@ -123,7 +123,7 @@ using iterator_contiguous_pointer_t = std::add_pointer_t>; #if WJR_DEBUG_LEVEL > 1 -#define WJR_HAS_DEBUG_CONTIGUOUS_ITERATOR_CHECK WJR_HAS_DEF + #define WJR_HAS_DEBUG_CONTIGUOUS_ITERATOR_CHECK WJR_HAS_DEF #endif } // namespace wjr diff --git a/include/wjr/json/lexer.hpp b/include/wjr/json/lexer.hpp index cb2cc295..1cffc907 100644 --- a/include/wjr/json/lexer.hpp +++ b/include/wjr/json/lexer.hpp @@ -4,7 +4,7 @@ #include #if defined(WJR_X86) -#include + #include #endif #endif // WJR_JSON_LEXER_HPP__ \ No newline at end of file diff --git a/include/wjr/json/string.hpp b/include/wjr/json/string.hpp index 097f0235..6ad50579 100644 --- a/include/wjr/json/string.hpp +++ b/include/wjr/json/string.hpp @@ -7,7 
+7,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr::json { diff --git a/include/wjr/math/add.hpp b/include/wjr/math/add.hpp index 3e0e4bc8..07e946cd 100644 --- a/include/wjr/math/add.hpp +++ b/include/wjr/math/add.hpp @@ -6,7 +6,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { @@ -23,7 +23,7 @@ WJR_INTRINSIC_CONSTEXPR T fallback_addc(T a, T b, U c_in, U &c_out) noexcept { } #if WJR_HAS_BUILTIN(__builtin_addc) -#define WJR_HAS_BUILTIN_ADDC WJR_HAS_DEF + #define WJR_HAS_BUILTIN_ADDC WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(ADDC) @@ -32,13 +32,14 @@ template WJR_INTRINSIC_INLINE T builtin_addc(T a, T b, U c_in, U &c_out) noexcept { constexpr auto nd = std::numeric_limits::digits; -#define WJR_REGISTER_BUILTIN_ADDC(suffix, type) \ - if constexpr (nd <= std::numeric_limits::digits) { \ - type __c_out; \ - const T ret = __builtin_addc##suffix(a, b, static_cast(c_in), &__c_out); \ - c_out = static_cast(__c_out); \ - return ret; \ - } else + #define WJR_REGISTER_BUILTIN_ADDC(suffix, type) \ + if constexpr (nd <= std::numeric_limits::digits) { \ + type __c_out; \ + const T ret = \ + __builtin_addc##suffix(a, b, static_cast(c_in), &__c_out); \ + c_out = static_cast(__c_out); \ + return ret; \ + } else WJR_REGISTER_BUILTIN_ADDC(b, unsigned char) WJR_REGISTER_BUILTIN_ADDC(s, unsigned short) @@ -48,7 +49,7 @@ WJR_INTRINSIC_INLINE T builtin_addc(T a, T b, U c_in, U &c_out) noexcept { static_assert(nd <= 64, "not supported yet"); } -#undef WJR_REGISTER_BUILTIN_ADDC + #undef WJR_REGISTER_BUILTIN_ADDC } #endif // WJR_HAS_BUILTIN(ADDC) @@ -144,7 +145,7 @@ WJR_INTRINSIC_CONSTEXPR20 T addc_cc(T a, T b, uint8_t c_in, uint8_t &c_out) noex } #if WJR_HAS_BUILTIN(__builtin_add_overflow) -#define WJR_HAS_BUILTIN_ADD_OVERFLOW WJR_HAS_DEF + #define WJR_HAS_BUILTIN_ADD_OVERFLOW WJR_HAS_DEF #endif template @@ -354,7 +355,7 @@ WJR_INTRINSIC_CONSTEXPR void __fallback_add_128(uint64_t &al, uint64_t &ah, uint } #if 
WJR_HAS_FEATURE(FAST_INT128_ADDSUB) -#define WJR_HAS_BUILTIN___BUILTIN_ADD_128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN___BUILTIN_ADD_128 WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(__BUILTIN_ADD_128) diff --git a/include/wjr/math/bit.hpp b/include/wjr/math/bit.hpp index 4cadf8ca..5956fd46 100644 --- a/include/wjr/math/bit.hpp +++ b/include/wjr/math/bit.hpp @@ -75,7 +75,7 @@ WJR_CONST WJR_INTRINSIC_CONSTEXPR20 T bit_floor(T x) noexcept { } #if WJR_HAS_BUILTIN(__builtin_bit_cast) || WJR_HAS_MSVC(19, 27) -#define WJR_HAS_BUILTIN_BIT_CAST WJR_HAS_DEF + #define WJR_HAS_BUILTIN_BIT_CAST WJR_HAS_DEF #endif template +template < + typename To, typename From, + WJR_REQUIRES(std::is_trivially_copyable_v &&std::is_trivially_copyable_v)> WJR_INTRINSIC_INLINE To bitwise_cast(From const &from) noexcept { return bit_detail::bitwise_cast_impl( - from, std::bool_constant<( - sizeof(From) == sizeof(To) && - atomics::detail::has_unique_object_representations::value)>()); + from, + std::bool_constant<(sizeof(From) == sizeof(To) && + std::has_unique_object_representations::value)>()); } } // namespace wjr diff --git a/include/wjr/math/clz.hpp b/include/wjr/math/clz.hpp index 307863fd..4d07bb0d 100644 --- a/include/wjr/math/clz.hpp +++ b/include/wjr/math/clz.hpp @@ -5,13 +5,13 @@ #include #if WJR_HAS_BUILTIN(__builtin_clz) -#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF + #define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF #elif defined(WJR_MSVC) && defined(WJR_X86) -#define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF_VAR(2) + #define WJR_HAS_BUILTIN_CLZ WJR_HAS_DEF_VAR(2) #endif #if WJR_HAS_BUILTIN(CLZ) == 2 -#include + #include #endif namespace wjr { @@ -92,7 +92,7 @@ WJR_CONST WJR_INTRINSIC_INLINE int builtin_clz(T x) noexcept { if constexpr (nd < 32) { return builtin_clz(static_cast(x)) - (32 - nd); } else { -#if WJR_HAS_BUILTIN(CLZ) == 1 + #if WJR_HAS_BUILTIN(CLZ) == 1 if constexpr (nd <= std::numeric_limits::digits) { constexpr auto delta = std::numeric_limits::digits - nd; return __builtin_clz(static_cast(x)) - delta; 
@@ -105,7 +105,7 @@ WJR_CONST WJR_INTRINSIC_INLINE int builtin_clz(T x) noexcept { } else { static_assert(nd <= 64, "not supported yet"); } -#else + #else if constexpr (nd == 32) { unsigned long result; (void)_BitScanReverse(&result, x); @@ -115,7 +115,7 @@ WJR_CONST WJR_INTRINSIC_INLINE int builtin_clz(T x) noexcept { (void)_BitScanReverse64(&result, x); return 63 - result; } -#endif + #endif } } diff --git a/include/wjr/math/compare.hpp b/include/wjr/math/compare.hpp index f30d4d68..f44336c8 100644 --- a/include/wjr/math/compare.hpp +++ b/include/wjr/math/compare.hpp @@ -5,7 +5,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { @@ -88,8 +88,8 @@ WJR_PURE WJR_INTRINSIC_CONSTEXPR20 int reverse_compare_n(const T *src0, const T } #if WJR_HAS_FEATURE(FAST_INT128_COMPARE) -#define WJR_HAS_BUILTIN___BUILTIN_LESS_128 WJR_HAS_DEF -#define WJR_HAS_BUILTIN___BUILTIN_LESS_EQUAL_128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN___BUILTIN_LESS_128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN___BUILTIN_LESS_EQUAL_128 WJR_HAS_DEF #endif WJR_INTRINSIC_CONSTEXPR20 bool __fallback_less_128(uint64_t lo0, uint64_t hi0, diff --git a/include/wjr/math/ctz.hpp b/include/wjr/math/ctz.hpp index a09bfba4..46807a20 100644 --- a/include/wjr/math/ctz.hpp +++ b/include/wjr/math/ctz.hpp @@ -5,13 +5,13 @@ #include #if WJR_HAS_BUILTIN(__builtin_ctz) -#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF + #define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF #elif defined(WJR_MSVC) && defined(WJR_X86) -#define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF_VAR(2) + #define WJR_HAS_BUILTIN_CTZ WJR_HAS_DEF_VAR(2) #endif #if WJR_HAS_BUILTIN(CTZ) == 2 -#include + #include #endif namespace wjr { @@ -53,7 +53,7 @@ WJR_CONST WJR_INTRINSIC_INLINE int builtin_ctz(T x) noexcept { if constexpr (nd < 32) { return builtin_ctz(static_cast(x)); } else { -#if WJR_HAS_BUILTIN(CTZ) == 1 + #if WJR_HAS_BUILTIN(CTZ) == 1 if constexpr (nd <= std::numeric_limits::digits) { return __builtin_ctz(static_cast(x)); } else if constexpr (nd <= 
std::numeric_limits::digits) { @@ -63,7 +63,7 @@ WJR_CONST WJR_INTRINSIC_INLINE int builtin_ctz(T x) noexcept { } else { static_assert(nd <= 64, "not supported yet"); } -#else + #else if constexpr (nd == 32) { unsigned long result; (void)_BitScanForward(&result, x); @@ -73,7 +73,7 @@ WJR_CONST WJR_INTRINSIC_INLINE int builtin_ctz(T x) noexcept { (void)_BitScanForward64(&result, x); return result; } -#endif + #endif } } diff --git a/include/wjr/math/detail.hpp b/include/wjr/math/detail.hpp index 340a96ed..d37ac43b 100644 --- a/include/wjr/math/detail.hpp +++ b/include/wjr/math/detail.hpp @@ -94,7 +94,7 @@ WJR_CONST constexpr T __fasts_negate_with(T condition, T x) noexcept { template )> WJR_CONST constexpr T __fasts_increment(T x) noexcept { WJR_ASSERT_L2(x != std::numeric_limits::min() && - x != std::numeric_limits::max()); + x != std::numeric_limits::max()); return x < 0 ? x - 1 : x + 1; } diff --git a/include/wjr/math/div.hpp b/include/wjr/math/div.hpp index 4008d488..519b8bc3 100644 --- a/include/wjr/math/div.hpp +++ b/include/wjr/math/div.hpp @@ -4,7 +4,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/math/divider.hpp b/include/wjr/math/divider.hpp index 80b1a852..ba66477a 100644 --- a/include/wjr/math/divider.hpp +++ b/include/wjr/math/divider.hpp @@ -7,7 +7,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/math/find.hpp b/include/wjr/math/find.hpp index 24860d89..137c18b2 100644 --- a/include/wjr/math/find.hpp +++ b/include/wjr/math/find.hpp @@ -4,7 +4,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/math/mul.hpp b/include/wjr/math/mul.hpp index b5d0fae6..d54cacdf 100644 --- a/include/wjr/math/mul.hpp +++ b/include/wjr/math/mul.hpp @@ -14,15 +14,15 @@ #include #if defined(WJR_MSVC) && defined(WJR_X86) -#define WJR_HAS_BUILTIN_MSVC_MULH64 WJR_HAS_DEF + #define WJR_HAS_BUILTIN_MSVC_MULH64 WJR_HAS_DEF 
#endif #if defined(WJR_X86) -#include + #include #endif #if WJR_HAS_BUILTIN(MSVC_MULH64) -#include + #include #endif namespace wjr { @@ -85,20 +85,20 @@ WJR_INTRINSIC_CONSTEXPR20 uint64_t __mul_u64(uint64_t a, uint64_t b, #if WJR_HAS_BUILTIN(UMUL128) if (is_constant_evaluated() -#if WJR_HAS_BUILTIN(ASM_UMUL128) + #if WJR_HAS_BUILTIN(ASM_UMUL128) || (WJR_BUILTIN_CONSTANT_P(a) && WJR_BUILTIN_CONSTANT_P(b)) -#endif + #endif ) { return fallback_mul64(a, b, hi); } -#if WJR_HAS_BUILTIN(ASM_UMUL128) + #if WJR_HAS_BUILTIN(ASM_UMUL128) // mov b to rax, then mul a // instead of mov a to rax, mov b to register, then mul if (WJR_BUILTIN_CONSTANT_P(b)) { return builtin_umul128(b, a, hi); } -#endif + #endif return builtin_umul128(a, b, hi); #else return fallback_mul64(a, b, hi); @@ -138,7 +138,7 @@ WJR_CONST WJR_INTRINSIC_CONSTEXPR T mullo(T a, T b) noexcept { } #if WJR_HAS_BUILTIN(__builtin_mul_overflow) -#define WJR_HAS_BUILTIN_MUL_OVERFLOW WJR_HAS_DEF + #define WJR_HAS_BUILTIN_MUL_OVERFLOW WJR_HAS_DEF #endif template diff --git a/include/wjr/math/not.hpp b/include/wjr/math/not.hpp index c4518bc4..3c84f821 100644 --- a/include/wjr/math/not.hpp +++ b/include/wjr/math/not.hpp @@ -4,7 +4,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/math/popcount.hpp b/include/wjr/math/popcount.hpp index 2319e6d3..468c42aa 100644 --- a/include/wjr/math/popcount.hpp +++ b/include/wjr/math/popcount.hpp @@ -5,15 +5,15 @@ #if WJR_HAS_SIMD(POPCNT) -#if WJR_HAS_BUILTIN(__builtin_popcount) -#define WJR_HAS_BUILTIN_POPCOUNT WJR_HAS_DEF -#elif defined(WJR_MSVC) -#define WJR_HAS_BUILTIN_POPCOUNT WJR_HAS_DEF_VAR(2) -#endif + #if WJR_HAS_BUILTIN(__builtin_popcount) + #define WJR_HAS_BUILTIN_POPCOUNT WJR_HAS_DEF + #elif defined(WJR_MSVC) + #define WJR_HAS_BUILTIN_POPCOUNT WJR_HAS_DEF_VAR(2) + #endif -#if WJR_HAS_BUILTIN(POPCOUNT) == 2 -#include -#endif + #if WJR_HAS_BUILTIN(POPCOUNT) == 2 + #include + #endif #endif @@ -78,7 +78,7 @@ WJR_CONST 
WJR_INTRINSIC_CONSTEXPR int fallback_popcount(T x) noexcept { template WJR_CONST WJR_INTRINSIC_INLINE int builtin_popcount(T x) noexcept { constexpr auto nd = std::numeric_limits::digits; -#if WJR_HAS_BUILTIN(POPCOUNT) == 1 + #if WJR_HAS_BUILTIN(POPCOUNT) == 1 if constexpr (nd < 32) { return builtin_popcount(static_cast(x)); } else { @@ -92,7 +92,7 @@ WJR_CONST WJR_INTRINSIC_INLINE int builtin_popcount(T x) noexcept { static_assert(nd <= 64, "not support yet"); } } -#else + #else if constexpr (nd < 32) { return builtin_popcount(static_cast(x)); } else { @@ -105,7 +105,7 @@ WJR_CONST WJR_INTRINSIC_INLINE int builtin_popcount(T x) noexcept { } } -#endif // WJR_HAS_BUILTIN(POPCOUNT) + #endif // WJR_HAS_BUILTIN(POPCOUNT) } #endif diff --git a/include/wjr/math/prefix_xor.hpp b/include/wjr/math/prefix_xor.hpp index dd72a21f..2a95d19c 100644 --- a/include/wjr/math/prefix_xor.hpp +++ b/include/wjr/math/prefix_xor.hpp @@ -4,7 +4,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/math/set.hpp b/include/wjr/math/set.hpp index f2dcc5b9..7eedfb2d 100644 --- a/include/wjr/math/set.hpp +++ b/include/wjr/math/set.hpp @@ -7,7 +7,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/math/shift.hpp b/include/wjr/math/shift.hpp index a1f830b8..e3789748 100644 --- a/include/wjr/math/shift.hpp +++ b/include/wjr/math/shift.hpp @@ -5,7 +5,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/math/sub.hpp b/include/wjr/math/sub.hpp index 5d764199..ad248e13 100644 --- a/include/wjr/math/sub.hpp +++ b/include/wjr/math/sub.hpp @@ -7,7 +7,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { @@ -24,7 +24,7 @@ WJR_INTRINSIC_CONSTEXPR T fallback_subc(T a, T b, U c_in, U &c_out) noexcept { } #if WJR_HAS_BUILTIN(__builtin_subc) -#define WJR_HAS_BUILTIN_SUBC WJR_HAS_DEF + #define WJR_HAS_BUILTIN_SUBC WJR_HAS_DEF #endif #if 
WJR_HAS_BUILTIN(SUBC) @@ -33,13 +33,13 @@ template WJR_INTRINSIC_INLINE T builtin_subc(T a, T b, U c_in, U &c_out) noexcept { constexpr auto nd = std::numeric_limits::digits; -#define WJR_REGISTER_BUILTIN_SUBC(suffix, type) \ - if constexpr (nd <= std::numeric_limits::digits) { \ - type __c_out; \ - T ret = __builtin_subc##suffix(a, b, static_cast(c_in), &__c_out); \ - c_out = static_cast(__c_out); \ - return ret; \ - } else + #define WJR_REGISTER_BUILTIN_SUBC(suffix, type) \ + if constexpr (nd <= std::numeric_limits::digits) { \ + type __c_out; \ + T ret = __builtin_subc##suffix(a, b, static_cast(c_in), &__c_out); \ + c_out = static_cast(__c_out); \ + return ret; \ + } else WJR_REGISTER_BUILTIN_SUBC(b, unsigned char) WJR_REGISTER_BUILTIN_SUBC(s, unsigned short) @@ -49,7 +49,7 @@ WJR_INTRINSIC_INLINE T builtin_subc(T a, T b, U c_in, U &c_out) noexcept { static_assert(nd <= 64, "not supported yet"); } -#undef WJR_REGISTER_BUILTIN_SUBC + #undef WJR_REGISTER_BUILTIN_SUBC } #endif // WJR_HAS_BUILTIN(SUBC) @@ -126,7 +126,7 @@ WJR_INTRINSIC_CONSTEXPR20 T subc_cc(T a, T b, uint8_t c_in, uint8_t &c_out) noex } #if WJR_HAS_BUILTIN(__builtin_sub_overflow) -#define WJR_HAS_BUILTIN_SUB_OVERFLOW WJR_HAS_DEF + #define WJR_HAS_BUILTIN_SUB_OVERFLOW WJR_HAS_DEF #endif template @@ -547,7 +547,7 @@ WJR_INTRINSIC_CONSTEXPR void __fallback_sub_128(uint64_t &al, uint64_t &ah, uint } #if WJR_HAS_FEATURE(FAST_INT128_ADDSUB) -#define WJR_HAS_BUILTIN___BUILTIN_SUB_128 WJR_HAS_DEF + #define WJR_HAS_BUILTIN___BUILTIN_SUB_128 WJR_HAS_DEF #endif #if WJR_HAS_BUILTIN(__BUILTIN_SUBC_128) diff --git a/include/wjr/memory/detail.hpp b/include/wjr/memory/detail.hpp index 728ca4ec..714a0e75 100644 --- a/include/wjr/memory/detail.hpp +++ b/include/wjr/memory/detail.hpp @@ -103,7 +103,7 @@ WJR_CONST WJR_INTRINSIC_CONSTEXPR T fallback_byteswap(T x) noexcept { } #if WJR_HAS_BUILTIN(__builtin_bswap16) -#define WJR_HAS_BUILTIN_BYTESWAP WJR_HAS_DEF + #define WJR_HAS_BUILTIN_BYTESWAP WJR_HAS_DEF #endif #if 
WJR_HAS_BUILTIN(BYTESWAP) diff --git a/include/wjr/memory/safe_pointer.hpp b/include/wjr/memory/safe_pointer.hpp index 38511d2e..64760181 100644 --- a/include/wjr/memory/safe_pointer.hpp +++ b/include/wjr/memory/safe_pointer.hpp @@ -6,7 +6,7 @@ namespace wjr { #if WJR_DEBUG_LEVEL > 2 -#define WJR_HAS_DEBUG_SAFE_POINTER WJR_HAS_DEF + #define WJR_HAS_DEBUG_SAFE_POINTER WJR_HAS_DEF #endif #if WJR_HAS_DEBUG(SAFE_POINTER) diff --git a/include/wjr/network/pause.hpp b/include/wjr/network/pause.hpp deleted file mode 100644 index 41571728..00000000 --- a/include/wjr/network/pause.hpp +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef WJR_NETWORK_PAUSE_HPP__ -#define WJR_NETWORK_PAUSE_HPP__ - -#include - -namespace wjr { - -#if defined(_MSC_VER) -#if defined(_M_AMD64) || defined(_M_IX86) -extern "C" void _mm_pause(void); -#if defined(WJR_COMPILER_MSVC) -#pragma intrinsic(_mm_pause) -#endif -#elif defined(_M_ARM64) || defined(_M_ARM) -extern "C" void __yield(void); -#if defined(WJR_COMPILER_MSVC) -#pragma intrinsic(__yield) -#endif -#endif -#endif - -WJR_INTRINSIC_INLINE void pause() noexcept { -#if defined(_MSC_VER) -#if defined(_M_AMD64) || defined(_M_IX86) - _mm_pause(); -#elif defined(_M_ARM64) || defined(_M_ARM) - __yield(); -#endif -#elif defined(__GNUC__) -#if defined(__i386__) || defined(__x86_64__) - asm volatile("pause\n\t" : : : "memory"); -#elif (defined(__ARM_ARCH) && __ARM_ARCH >= 8) || defined(__ARM_ARCH_8A__) || \ - defined(__aarch64__) - asm volatile("yield\n\t" : : : "memory"); -#elif defined(__riscv) && __riscv_xlen == 64 -#if defined(__riscv_zihintpause) - asm volatile("pause\n\t" : : : "memory"); -#else - /* Encoding of the pause instruction */ - asm volatile(".4byte 0x100000F\n\t"); -#endif -#endif -#endif -} - -} // namespace wjr - -#include - -#endif // WJR_NETWORK_PAUSE_HPP__ diff --git a/include/wjr/preprocessor/arithmatic/dec.hpp b/include/wjr/preprocessor/arithmatic/dec.hpp index 675a0368..471b433a 100644 --- a/include/wjr/preprocessor/arithmatic/dec.hpp +++ 
b/include/wjr/preprocessor/arithmatic/dec.hpp @@ -2,7 +2,6 @@ #define WJR_PREPROCESSOR_ARITHMATIC_DEC_HPP__ #define WJR_PP_DEC(x) WJR_PP_DEC_I(x) - #define WJR_PP_DEC_I(x) WJR_PP_DEC_##x #define WJR_PP_DEC_0 63 diff --git a/include/wjr/preprocessor/compiler.hpp b/include/wjr/preprocessor/compiler.hpp deleted file mode 100644 index e3fc002e..00000000 --- a/include/wjr/preprocessor/compiler.hpp +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef WJR_PREPROCESSOR_COMPILER_HPP__ -#define WJR_PREPROCESSOR_COMPILER_HPP__ - -#include - -#endif // ! WJR_PREPROCESSOR_COMPILER_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/compiler/arch.hpp b/include/wjr/preprocessor/compiler/arch.hpp deleted file mode 100644 index 0d912d8c..00000000 --- a/include/wjr/preprocessor/compiler/arch.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef WJR_PREPROCESSOR_COMPILER_ARCH_HPP__ -#define WJR_PREPROCESSOR_COMPILER_ARCH_HPP__ - -#if defined(__pnacl__) || defined(__CLR_VER) -#define WJR_VM -#endif - -#if (defined(_M_IX86) || defined(__i386__)) && !defined(WJR_VM) -#define WJR_X86_32 -#endif - -#if (defined(_M_X64) || defined(__x86_64__)) && !defined(WJR_VM) -#define WJR_X86_64 -#endif - -#if defined(WJR_X86_32) || defined(WJR_X86_64) -#define WJR_X86 -#endif - -#if (defined(__arm__) || defined(_M_ARM)) -#define WJR_ARM -#endif - -#if defined(__aarch64__) -#define WJR_AARCH64 -#endif - -#if defined(__powerpc64__) -#define WJR_PPC64 -#endif - -#if defined(WJR_X86_64) -#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) -#define CPU_INTEL -#elif defined(_M_AMD64) -#define CPU_AMD -#endif -#else -#define CPU_UNKNOWN -#endif - -#endif // !WJR_PREPROCESSOR_COMPILER_ARCH_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/compiler/attribute.hpp b/include/wjr/preprocessor/compiler/attribute.hpp deleted file mode 100644 index 40a859b0..00000000 --- a/include/wjr/preprocessor/compiler/attribute.hpp +++ /dev/null @@ -1,300 +0,0 @@ -#ifndef 
WJR_PREPROCESSOR_COMPILER_ATTRIBUTE_HPP__ -#define WJR_PREPROCESSOR_COMPILER_ATTRIBUTE_HPP__ - -#include - -#if defined(WJR_CXX_20) -#include -#endif - -#if WJR_HAS_CPP_ATTRIBUTE(fallthrough) -#define WJR_FALLTHROUGH [[fallthrough]] -#elif WJR_HAS_ATTRIBUTE(fallthrough) -#define WJR_FALLTHROUGH __attribute__((fallthrough)) -#elif defined(_MSC_VER) && defined(__fallthrough) -#define WJR_FALLTHROUGH __fallthrough -#else -#define WJR_FALLTHROUGH -#endif - -#if WJR_HAS_CPP_ATTRIBUTE(noreturn) -#define WJR_NORETURN [[noreturn]] -#elif WJR_HAS_ATTRIBUTE(noreturn) -#define WJR_NORETURN __attribute__((noreturn)) -#elif defined(_MSC_VER) -#define WJR_NORETURN __declspec(noreturn) -#else -#define WJR_NORETURN -#endif - -#if WJR_HAS_CPP_ATTRIBUTE(nodiscard) -#define WJR_NODISCARD [[nodiscard]] -#elif WJR_HAS_ATTRIBUTE(nodiscard) -#define WJR_NODISCARD __attribute__((nodiscard)) -#elif defined(_MSC_VER) -#define WJR_NODISCARD _Check_return_ -#else -#define WJR_NODISCARD -#endif - -#if WJR_HAS_CPP_ATTRIBUTE(deprecated) -#define WJR_DEPRECATED [[deprecated]] -#elif WJR_HAS_ATTRIBUTE(deprecated) -#define WJR_DEPRECATED __attribute__((deprecated)) -#elif defined(_MSC_VER) -#define WJR_DEPRECATED __declspec(deprecated) -#else -#define WJR_DEPRECATED -#endif - -#if WJR_HAS_CPP_ATTRIBUTE(maybe_unused) -#define WJR_MAYBE_UNUSED [[maybe_unused]] -#elif WJR_HAS_ATTRIBUTE(maybe_unused) -#define WJR_MAYBE_UNUSED __attribute__((maybe_unused)) -#elif defined(_MSC_VER) -#define WJR_MAYBE_UNUSED -#else -#define WJR_MAYBE_UNUSED -#endif - -#if WJR_HAS_ATTRIBUTE(always_inline) -#define WJR_FORCEINLINE __attribute__((always_inline)) -#elif defined(_MSC_VER) -#define WJR_FORCEINLINE __forceinline -#else -#define WJR_FORCEINLINE -#endif - -#if defined(_MSV_VER) -#define WJR_SAFEBUFFERS __declspec(safebuffers) -#else -#define WJR_SAFEBUFFERS -#endif - -#if WJR_HAS_ATTRIBUTE(flatten) -#define WJR_FLATTEN __attribute__((flatten)) -#elif WJR_HAS_MSVC(17, 00) -#define WJR_FLATTEN [[msvc::flatten]] 
-#else -#define WJR_FLATTEN -#endif - -#if WJR_HAS_FEATURE(FORCEINLINE_LAMBDA) -#define WJR_FORCEINLINE_LAMBDA WJR_FORCEINLINE -#else -#define WJR_FORCEINLINE_LAMBDA -#endif - -// NOINLINE for MSVC/GCC/CLANG ... -#if WJR_HAS_ATTRIBUTE(noinline) -#define WJR_NOINLINE __attribute__((noinline)) -#elif defined(_MSC_VER) -#define WJR_NOINLINE __declspec(noinline) -#else -#define WJR_NOINLINE -#endif - -#if WJR_HAS_ATTRIBUTE(hot) -#define WJR_HOT __attribute__((hot)) -#elif defined(_MSC_VER) -#define WJR_HOT -#else -#define WJR_HOT -#endif - -#if WJR_HAS_ATTRIBUTE(cold) -#define WJR_COLD __attribute__((cold)) -#elif defined(_MSC_VER) -#define WJR_COLD -#else -#define WJR_COLD -#endif - -#if WJR_HAS_ATTRIBUTE(aligned) -#define WJR_ALIGNED(size) __attribute__((aligned(size))) -#elif defined(_MSC_VER) -#define WJR_ALIGNED(size) -#else -#define WJR_ALIGNED(size) -#endif - -#if defined(__cpp_lib_unreachable) -#define WJR_UNREACHABLE() std::unreachable() -#elif WJR_HAS_BUILTIN(__builtin_unreachable) -#define WJR_UNREACHABLE() __builtin_unreachable() -#elif defined(WJR_COMPILER_MSVC) -#define WJR_UNREACHABLE() __assume(0) -#else -#define WJR_UNREACHABLE() -#endif - -#if defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC) -#define WJR_RESTRICT __restrict -#else -#define WJR_RESTRICT -#endif - -#if defined(WJR_COMPILER_MSVC) -#define WJR_MS_ABI -#define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF -#elif WJR_HAS_ATTRIBUTE(__ms_abi__) -#define WJR_MS_ABI __attribute__((__ms_abi__)) -#define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF -#elif defined(WJR_ENABLE_ASSEMBLY) -#undef WJR_ENABLE_ASSEMBLY -#endif - -#define WJR_ASSUME_MAY_NOT_PURE(expr) \ - do { \ - if (!(expr)) { \ - WJR_UNREACHABLE(); \ - } \ - } while (false) - -#if WJR_HAS_BUILTIN(__builtin_assume) -#define WJR_ASSUME(expr) __builtin_assume(expr) -#elif defined(WJR_COMPILER_MSVC) -#define WJR_ASSUME(expr) __assume(expr) -#elif WJR_HAS_CPP_ATTRIBUTE(assume) -#define WJR_ASSUME(expr) [[assume(expr)]] -#else -#define WJR_ASSUME(expr) 
WJR_ASSUME_MAY_NOT_PURE(expr) -#endif - -#define WJR_BOOL_EXPR(expr) (!!(expr)) - -#if WJR_HAS_BUILTIN(__builtin_expect) -#define WJR_EXPECT(expr, expect) __builtin_expect((expr), (expect)) -#else -#define WJR_EXPECT(expr, expect) (expr) -#endif - -#define WJR_LIKELY(expr) WJR_EXPECT(WJR_BOOL_EXPR(expr), true) -#define WJR_UNLIKELY(expr) WJR_EXPECT(WJR_BOOL_EXPR(expr), false) - -#define WJR_HAS_FEATURE_IS_CONSTANT_EVALUATED WJR_HAS_DEF - -#if WJR_HAS_BUILTIN(__builtin_expect_with_probability) -#define WJR_EXPECT_WITH_PROBABILITY(exp, c, probability) \ - __builtin_expect_with_probability(exp, c, probability) -#else -#define WJR_EXPECT_WITH_PROBABILITY(exp, c, probability) (expr) -#endif - -#if WJR_HAS_BUILTIN(__builtin_expect_with_probability) -#define WJR_VERY_LIKELY(exp, probability) \ - WJR_EXPECT_WITH_PROBABILITY(exp, true, probability) -#define WJR_VERY_UNLIKELY(exp, probability) \ - WJR_EXPECT_WITH_PROBABILITY(exp, false, probability) -#else -#define WJR_VERY_LIKELY(exp, probability) WJR_LIKELY((exp)) -#define WJR_VERY_UNLIKELY(exp, probability) WJR_UNLIKELY((exp)) -#endif - -#if defined(__cpp_lib_is_constant_evaluated) -#define WJR_IS_CONSTANT_EVALUATED() std::is_constant_evaluated() -#elif WJR_HAS_BUILTIN(__builtin_is_constant_evaluated) -#define WJR_IS_CONSTANT_EVALUATED() __builtin_is_constant_evaluated() -#else -#define WJR_IS_CONSTANT_EVALUATED() false -#undef WJR_HAS_FEATURE_IS_CONSTANT_EVALUATED -#endif - -/** - * @details For Clang version 14.0.0-, __builtin_constant_p have some performance issue. - * \n - * 1. Cannot propagate __builtin_constant_p beyond function \n - * 2. Maybe prevent function be inlined. For low version clang, it will inline all path - * even if only one path will be executed, then the function that calls it will not be - * inlined. 
- */ -#if WJR_HAS_BUILTIN(__builtin_constant_p) -#define WJR_BUILTIN_CONSTANT_P(expr) __builtin_constant_p(expr) -#else -#define WJR_BUILTIN_CONSTANT_P(expr) false -#endif - -#if WJR_HAS_BUILTIN(__builtin_constant_p) -#define WJR_BUILTIN_CONSTANT_P_TRUE(expr) (WJR_BUILTIN_CONSTANT_P(expr) && (expr)) -#else -#define WJR_BUILTIN_CONSTANT_P_TRUE(expr) false -#endif - -#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_COMPILER_BARRIER() asm volatile("" ::: "memory") -#define WJR_COMPILER_EMPTY_ASM() asm("") -#else -#define WJR_COMPILER_BARRIER() -#define WJR_COMPILER_EMPTY_ASM() -#endif - -#define WJR_CONSTEXPR_COMPILER_BARRIER() \ - do { \ - if (!(WJR_IS_CONSTANT_EVALUATED())) { \ - WJR_COMPILER_BARRIER(); \ - } \ - } while (false) - -#if defined(WJR_FORCEINLINE) -#define WJR_INTRINSIC_INLINE inline WJR_FORCEINLINE -#else -#define WJR_INTRINSIC_INLINE inline -#endif - -// pure attribute -#if WJR_HAS_ATTRIBUTE(pure) -#define WJR_PURE __attribute__((pure)) -#else -#define WJR_PURE -#endif - -// const attribute -#if WJR_HAS_ATTRIBUTE(const) -#define WJR_CONST __attribute__((const)) -#else -#define WJR_CONST -#endif - -#if WJR_HAS_ATTRIBUTE(malloc) -#define WJR_MALLOC __attribute__((malloc)) -#else -#define WJR_MALLOC -#endif - -#if WJR_HAS_ATTRIBUTE(nonnull) -#define WJR_NONNULL(...) __attribute__((__VA_ARGS__)) -#else -#define WJR_NONNULL(...) 
-#endif - -#if WJR_HAS_ATTRIBUTE(__may_alias__) -#define WJR_MAY_ALIAS __attribute__((__may_alias__)) -#else -#define WJR_MAY_ALIAS -#endif - -#define WJR_INLINE inline -#define WJR_CONSTEXPR constexpr - -#if defined(WJR_CXX_20) -#define WJR_CONSTEXPR20 constexpr -#else -#define WJR_CONSTEXPR20 -#endif - -#define WJR_INTRINSIC_CONSTEXPR WJR_INTRINSIC_INLINE constexpr -#define WJR_INTRINSIC_CONSTEXPR20 WJR_INTRINSIC_INLINE WJR_CONSTEXPR20 - -#define WJR_INLINE_CONSTEXPR inline constexpr -#define WJR_INLINE_CONSTEXPR20 inline WJR_CONSTEXPR20 - -#define WJR_ATTRIBUTE(attribute) WJR_ATTRIBUTE_I(attribute) -#define WJR_ATTRIBUTE_I(attribute) WJR_##attribute - -#if defined(_MSC_VER) -#define WJR_EMPTY_BASES __declspec(empty_bases) -#else -#define WJR_EMPTY_BASES -#endif - -#endif // WJR_PREPROCESSOR_COMPILER_ATTRIBUTE_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/compiler/compiler.hpp b/include/wjr/preprocessor/compiler/compiler.hpp deleted file mode 100644 index bd37017b..00000000 --- a/include/wjr/preprocessor/compiler/compiler.hpp +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef WJR_PREPROCESSOR_COMPILER_COMPILER_HPP__ -#define WJR_PREPROCESSOR_COMPILER_COMPILER_HPP__ - -#if defined(__clang__) -#define WJR_COMPILER_CLANG -#elif defined(__GNUC__) -#define WJR_COMPILER_GCC -#elif defined(_MSC_VER) -#define WJR_COMPILER_MSVC -#endif - -#if defined(_MSC_VER) -#define WJR_MSVC -#endif // _MSC_VER - -#if defined(__GNUC__) -#define WJR_HAS_GCC(major, minor, patchlevel) \ - ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ > (minor)) || \ - (__GNUC__ == (major) && __GNUC_MINOR__ == (minor) && \ - __GNUC_PATCHLEVEL__ >= (patchlevel))) -#else -#define WJR_HAS_GCC(major, minor, patchlevel) 0 -#endif // __GNUC__ - -#if defined(__clang__) -#define WJR_HAS_CLANG(major, minor, patchlevel) \ - ((__clang_major__ > (major)) || \ - (__clang_major__ == (major) && __clang_minor__ > (minor)) || \ - (__clang_major__ == (major) && __clang_minor__ == (minor) && 
\ - __clang_patchlevel__ >= (patchlevel))) -#else -#define WJR_HAS_CLANG(major, minor, patchlevel) 0 -#endif - -#if defined(_MSC_VER) -#define WJR_HAS_MSVC(minor, level) (_MSC_VER >= (minor)*100 + (level)) -#else -#define WJR_HAS_MSVC(minor, level) 0 -#endif - -#if (defined(WJR_COMPILER_GCC) && !WJR_HAS_GCC(7, 1, 0)) || \ - (defined(WJR_COMPILER_CLANG) && !WJR_HAS_CLANG(5, 0, 0)) -#error "GCC 7.1.0 or Clang 5.0.0 or later is required" -#endif - -#if defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC) -#define WJR_CXX_STANDARD __cplusplus -#elif defined(WJR_COMPILER_MSVC) -#define WJR_CXX_STANDARD _MSVC_LANG -#endif - -#if WJR_CXX_STANDARD >= 199711L -#define WJR_CXX_03 -#endif -#if WJR_CXX_STANDARD >= 201103L -#define WJR_CXX_11 -#endif -#if WJR_CXX_STANDARD >= 201402L -#define WJR_CXX_14 -#endif -#if WJR_CXX_STANDARD >= 201703L -#define WJR_CXX_17 -#endif -#if WJR_CXX_STANDARD >= 202002L -#define WJR_CXX_20 -#endif - -#ifndef WJR_CXX_17 -#error "required C++17 or later" -#endif // c++17 - -#if defined(__cpp_char8_t) -#define WJR_CHAR8_T -#endif // __cpp_char8_t - -#if defined(__LINE__) -#define WJR_LINE __LINE__ -#elif defined(__COUNTER__) -#define WJR_LINE __COUNTER__ -#else -#define WJR_LINE -1 -#endif - -#ifdef __FILE__ -#define WJR_FILE __FILE__ -#else -#define WJR_FILE "" -#endif - -// reference: boost BOOST_CURRENT_FUNCTION -#if defined(WJR_DISABLE_CURRENT_FUNCTION) -#define WJR_CURRENT_FUNCTION "(unknown)" -#elif defined(__GNUC__) || (defined(__MWERKS__) && (__MWERKS__ >= 0x3000)) || \ - (defined(__ICC) && (__ICC >= 600)) || defined(__ghs__) -#define WJR_CURRENT_FUNCTION __PRETTY_FUNCTION__ -#elif defined(__DMC__) && (__DMC__ >= 0x810) -#define WJR_CURRENT_FUNCTION __PRETTY_FUNCTION__ -#elif defined(__FUNCSIG__) -#define WJR_CURRENT_FUNCTION __FUNCSIG__ -#elif (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 600)) || \ - (defined(__IBMCPP__) && (__IBMCPP__ >= 500)) -#define WJR_CURRENT_FUNCTION __FUNCTION__ -#elif defined(__BORLANDC__) && 
(__BORLANDC__ >= 0x550) -#define WJR_CURRENT_FUNCTION __FUNC__ -#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901) -#define WJR_CURRENT_FUNCTION __func__ -#elif defined(__cplusplus) && (__cplusplus >= 201103) -#define WJR_CURRENT_FUNCTION __func__ -#else -#define WJR_CURRENT_FUNCTION "(unknown)" -#endif - -#endif // !WJR_PREPROCESSOR_COMPILER_COMPILER_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/config.hpp b/include/wjr/preprocessor/config.hpp new file mode 100644 index 00000000..680cd49c --- /dev/null +++ b/include/wjr/preprocessor/config.hpp @@ -0,0 +1,7 @@ +#ifndef WJR_PREPROCESSOR_CONFIG_HPP__ +#define WJR_PREPROCESSOR_CONFIG_HPP__ + +#include +#include + +#endif // ! WJR_PREPROCESSOR_CONFIG_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/config/arch.hpp b/include/wjr/preprocessor/config/arch.hpp new file mode 100644 index 00000000..f21420b2 --- /dev/null +++ b/include/wjr/preprocessor/config/arch.hpp @@ -0,0 +1,42 @@ +#ifndef WJR_PREPROCESSOR_CONFIG_ARCH_HPP__ +#define WJR_PREPROCESSOR_CONFIG_ARCH_HPP__ + +#if defined(__pnacl__) || defined(__CLR_VER) + #define WJR_VM +#endif + +#if (defined(_M_IX86) || defined(__i386__)) && !defined(WJR_VM) + #define WJR_X86_32 +#endif + +#if (defined(_M_X64) || defined(__x86_64__)) && !defined(WJR_VM) + #define WJR_X86_64 +#endif + +#if defined(WJR_X86_32) || defined(WJR_X86_64) + #define WJR_X86 +#endif + +#if (defined(__arm__) || defined(_M_ARM)) + #define WJR_ARM +#endif + +#if defined(__aarch64__) + #define WJR_AARCH64 +#endif + +#if defined(__powerpc64__) + #define WJR_PPC64 +#endif + +#if defined(WJR_X86_64) + #if defined(__i386__) || defined(_M_IX86) || defined(_X86_) + #define CPU_INTEL + #elif defined(_M_AMD64) + #define CPU_AMD + #endif +#else + #define CPU_UNKNOWN +#endif + +#endif // !WJR_PREPROCESSOR_CONFIG_ARCH_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/config/attribute.hpp b/include/wjr/preprocessor/config/attribute.hpp 
new file mode 100644 index 00000000..c18942e8 --- /dev/null +++ b/include/wjr/preprocessor/config/attribute.hpp @@ -0,0 +1,308 @@ +#ifndef WJR_PREPROCESSOR_CONFIG_ATTRIBUTE_HPP__ +#define WJR_PREPROCESSOR_CONFIG_ATTRIBUTE_HPP__ + +#include + +#if defined(WJR_CXX_20) + #include +#endif + +#if WJR_HAS_CPP_ATTRIBUTE(fallthrough) + #define WJR_FALLTHROUGH [[fallthrough]] +#elif WJR_HAS_ATTRIBUTE(fallthrough) + #define WJR_FALLTHROUGH __attribute__((fallthrough)) +#elif defined(_MSC_VER) && defined(__fallthrough) + #define WJR_FALLTHROUGH __fallthrough +#else + #define WJR_FALLTHROUGH +#endif + +#if WJR_HAS_CPP_ATTRIBUTE(noreturn) + #define WJR_NORETURN [[noreturn]] +#elif WJR_HAS_ATTRIBUTE(noreturn) + #define WJR_NORETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) + #define WJR_NORETURN __declspec(noreturn) +#else + #define WJR_NORETURN +#endif + +#if WJR_HAS_CPP_ATTRIBUTE(nodiscard) + #define WJR_NODISCARD [[nodiscard]] +#elif WJR_HAS_ATTRIBUTE(nodiscard) + #define WJR_NODISCARD __attribute__((nodiscard)) +#elif defined(_MSC_VER) + #define WJR_NODISCARD _Check_return_ +#else + #define WJR_NODISCARD +#endif + +#if WJR_HAS_CPP_ATTRIBUTE(deprecated) + #define WJR_DEPRECATED [[deprecated]] +#elif WJR_HAS_ATTRIBUTE(deprecated) + #define WJR_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) + #define WJR_DEPRECATED __declspec(deprecated) +#else + #define WJR_DEPRECATED +#endif + +#if WJR_HAS_CPP_ATTRIBUTE(maybe_unused) + #define WJR_MAYBE_UNUSED [[maybe_unused]] +#elif WJR_HAS_ATTRIBUTE(maybe_unused) + #define WJR_MAYBE_UNUSED __attribute__((maybe_unused)) +#elif defined(_MSC_VER) + #define WJR_MAYBE_UNUSED +#else + #define WJR_MAYBE_UNUSED +#endif + +#if WJR_HAS_ATTRIBUTE(always_inline) + #define WJR_FORCEINLINE __attribute__((always_inline)) +#elif defined(_MSC_VER) + #define WJR_FORCEINLINE __forceinline +#else + #define WJR_FORCEINLINE +#endif + +#if defined(_MSC_VER) + #define WJR_SAFEBUFFERS __declspec(safebuffers) +#else + #define
WJR_SAFEBUFFERS +#endif + +#if WJR_HAS_ATTRIBUTE(flatten) + #define WJR_FLATTEN __attribute__((flatten)) +#elif WJR_HAS_MSVC(17, 00) + #define WJR_FLATTEN [[msvc::flatten]] +#else + #define WJR_FLATTEN +#endif + +#if WJR_HAS_FEATURE(FORCEINLINE_LAMBDA) + #define WJR_FORCEINLINE_LAMBDA WJR_FORCEINLINE +#else + #define WJR_FORCEINLINE_LAMBDA +#endif + +// NOINLINE for MSVC/GCC/CLANG ... +#if WJR_HAS_ATTRIBUTE(noinline) + #define WJR_NOINLINE __attribute__((noinline)) +#elif defined(_MSC_VER) + #define WJR_NOINLINE __declspec(noinline) +#else + #define WJR_NOINLINE +#endif + +#if WJR_HAS_ATTRIBUTE(hot) + #define WJR_HOT __attribute__((hot)) +#elif defined(_MSC_VER) + #define WJR_HOT +#else + #define WJR_HOT +#endif + +#if WJR_HAS_ATTRIBUTE(cold) + #define WJR_COLD __attribute__((cold)) +#elif defined(_MSC_VER) + #define WJR_COLD +#else + #define WJR_COLD +#endif + +#if WJR_HAS_ATTRIBUTE(aligned) + #define WJR_ALIGNED(size) __attribute__((aligned(size))) +#elif defined(_MSC_VER) + #define WJR_ALIGNED(size) +#else + #define WJR_ALIGNED(size) +#endif + +#if defined(__cpp_lib_unreachable) + #define WJR_UNREACHABLE() std::unreachable() +#elif WJR_HAS_BUILTIN(__builtin_unreachable) + #define WJR_UNREACHABLE() __builtin_unreachable() +#elif defined(WJR_COMPILER_MSVC) + #define WJR_UNREACHABLE() __assume(0) +#else + #define WJR_UNREACHABLE() +#endif + +#if defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC) + #define WJR_RESTRICT __restrict +#else + #define WJR_RESTRICT +#endif + +#if defined(WJR_COMPILER_MSVC) + #define WJR_MS_ABI + #define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF +#elif WJR_HAS_ATTRIBUTE(__ms_abi__) + #define WJR_MS_ABI __attribute__((__ms_abi__)) + #define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF +#elif defined(WJR_ENABLE_ASSEMBLY) + #undef WJR_ENABLE_ASSEMBLY +#endif + +#define WJR_ASSUME_MAY_NOT_PURE(expr) \ + do { \ + if (!(expr)) { \ + WJR_UNREACHABLE(); \ + } \ + } while (false) + +#if WJR_HAS_BUILTIN(__builtin_assume) + #define WJR_ASSUME(expr) 
__builtin_assume(expr) +#elif defined(WJR_COMPILER_MSVC) + #define WJR_ASSUME(expr) __assume(expr) +#elif WJR_HAS_CPP_ATTRIBUTE(assume) + #define WJR_ASSUME(expr) [[assume(expr)]] +#else + #define WJR_ASSUME(expr) WJR_ASSUME_MAY_NOT_PURE(expr) +#endif + +#define WJR_BOOL_EXPR(expr) (!!(expr)) + +#if WJR_HAS_BUILTIN(__builtin_expect) + #define WJR_EXPECT(expr, expect) __builtin_expect((expr), (expect)) +#else + #define WJR_EXPECT(expr, expect) (expr) +#endif + +#define WJR_LIKELY(expr) WJR_EXPECT(WJR_BOOL_EXPR(expr), true) +#define WJR_UNLIKELY(expr) WJR_EXPECT(WJR_BOOL_EXPR(expr), false) + +#define WJR_HAS_FEATURE_IS_CONSTANT_EVALUATED WJR_HAS_DEF + +#if WJR_HAS_BUILTIN(__builtin_expect_with_probability) + #define WJR_EXPECT_WITH_PROBABILITY(exp, c, probability) \ + __builtin_expect_with_probability(exp, c, probability) +#else + #define WJR_EXPECT_WITH_PROBABILITY(exp, c, probability) (exp) +#endif + +#if WJR_HAS_BUILTIN(__builtin_expect_with_probability) + #define WJR_VERY_LIKELY(exp, probability) \ + WJR_EXPECT_WITH_PROBABILITY(exp, true, probability) + #define WJR_VERY_UNLIKELY(exp, probability) \ + WJR_EXPECT_WITH_PROBABILITY(exp, false, probability) +#else + #define WJR_VERY_LIKELY(exp, probability) WJR_LIKELY((exp)) + #define WJR_VERY_UNLIKELY(exp, probability) WJR_UNLIKELY((exp)) +#endif + +#if defined(__cpp_lib_is_constant_evaluated) + #define WJR_IS_CONSTANT_EVALUATED() std::is_constant_evaluated() +#elif WJR_HAS_BUILTIN(__builtin_is_constant_evaluated) + #define WJR_IS_CONSTANT_EVALUATED() __builtin_is_constant_evaluated() +#else + #define WJR_IS_CONSTANT_EVALUATED() false + #undef WJR_HAS_FEATURE_IS_CONSTANT_EVALUATED +#endif + +/** + * @details For Clang version 14.0.0-, __builtin_constant_p have some performance issue. + * \n + * 1. Cannot propagate __builtin_constant_p beyond function \n + * 2. Maybe prevent function be inlined.
For low version clang, it will inline all path + * even if only one path will be executed, then the function that calls it will not be + * inlined. + */ +#if WJR_HAS_BUILTIN(__builtin_constant_p) + #define WJR_BUILTIN_CONSTANT_P(expr) __builtin_constant_p(expr) +#else + #define WJR_BUILTIN_CONSTANT_P(expr) false +#endif + +#if WJR_HAS_BUILTIN(__builtin_constant_p) + #define WJR_BUILTIN_CONSTANT_P_TRUE(expr) (WJR_BUILTIN_CONSTANT_P(expr) && (expr)) +#else + #define WJR_BUILTIN_CONSTANT_P_TRUE(expr) false +#endif + +#if WJR_HAS_BUILTIN(__builtin_clear_padding) + #define WJR_HAS_BULTIN_CLEAR_PADDING WJR_HAS_DEF + #define WJR_BUILTIN_CLEAR_PADDING(x) __builtin_clear_padding(x) +#elif WJR_HAS_BUILTIN(__builtin_zero_non_value_bits) || \ + (defined(WJR_COMPILER_MSVC) && WJR_HAS_MSVC(19, 27)) + #define WJR_HAS_BULTIN_CLEAR_PADDING WJR_HAS_DEF + #define WJR_BUILTIN_CLEAR_PADDING(x) __builtin_zero_non_value_bits(x) +#endif + +#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) + #define WJR_COMPILER_BARRIER() asm volatile("" ::: "memory") + #define WJR_COMPILER_EMPTY_ASM() asm("") +#else + #define WJR_COMPILER_BARRIER() + #define WJR_COMPILER_EMPTY_ASM() +#endif + +#define WJR_CONSTEXPR_COMPILER_BARRIER() \ + do { \ + if (!(WJR_IS_CONSTANT_EVALUATED())) { \ + WJR_COMPILER_BARRIER(); \ + } \ + } while (false) + +#if defined(WJR_FORCEINLINE) + #define WJR_INTRINSIC_INLINE inline WJR_FORCEINLINE +#else + #define WJR_INTRINSIC_INLINE inline +#endif + +// pure attribute +#if WJR_HAS_ATTRIBUTE(pure) + #define WJR_PURE __attribute__((pure)) +#else + #define WJR_PURE +#endif + +// const attribute +#if WJR_HAS_ATTRIBUTE(const) + #define WJR_CONST __attribute__((const)) +#else + #define WJR_CONST +#endif + +#if WJR_HAS_ATTRIBUTE(malloc) + #define WJR_MALLOC __attribute__((malloc)) +#else + #define WJR_MALLOC +#endif + +#if WJR_HAS_ATTRIBUTE(nonnull) + #define WJR_NONNULL(...) __attribute__((__VA_ARGS__)) +#else + #define WJR_NONNULL(...) 
+#endif + +#if WJR_HAS_ATTRIBUTE(__may_alias__) + #define WJR_MAY_ALIAS __attribute__((__may_alias__)) +#else + #define WJR_MAY_ALIAS +#endif + +#define WJR_INLINE inline + +#if defined(WJR_CXX_20) + #define WJR_CONSTEXPR20 constexpr +#else + #define WJR_CONSTEXPR20 +#endif + +#define WJR_INTRINSIC_CONSTEXPR WJR_INTRINSIC_INLINE constexpr +#define WJR_INTRINSIC_CONSTEXPR20 WJR_INTRINSIC_INLINE WJR_CONSTEXPR20 + +#define WJR_INLINE_CONSTEXPR inline constexpr +#define WJR_INLINE_CONSTEXPR20 inline WJR_CONSTEXPR20 + +#define WJR_ATTRIBUTE(attribute) WJR_ATTRIBUTE_I(attribute) +#define WJR_ATTRIBUTE_I(attribute) WJR_##attribute + +#if defined(_MSC_VER) + #define WJR_EMPTY_BASES __declspec(empty_bases) +#else + #define WJR_EMPTY_BASES +#endif + +#endif // WJR_PREPROCESSOR_CONFIG_ATTRIBUTE_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/config/compiler.hpp b/include/wjr/preprocessor/config/compiler.hpp new file mode 100644 index 00000000..56dbbd55 --- /dev/null +++ b/include/wjr/preprocessor/config/compiler.hpp @@ -0,0 +1,113 @@ +#ifndef WJR_PREPROCESSOR_CONFIG_COMPILER_HPP__ +#define WJR_PREPROCESSOR_CONFIG_COMPILER_HPP__ + +#if defined(__clang__) + #define WJR_COMPILER_CLANG +#elif defined(__GNUC__) + #define WJR_COMPILER_GCC +#elif defined(_MSC_VER) + #define WJR_COMPILER_MSVC +#endif + +#if defined(_MSC_VER) + #define WJR_MSVC +#endif // _MSC_VER + +#if defined(__GNUC__) + #define WJR_HAS_GCC(major, minor, patchlevel) \ + ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ > (minor)) || \ + (__GNUC__ == (major) && __GNUC_MINOR__ == (minor) && \ + __GNUC_PATCHLEVEL__ >= (patchlevel))) +#else + #define WJR_HAS_GCC(major, minor, patchlevel) 0 +#endif // __GNUC__ + +#if defined(__clang__) + #define WJR_HAS_CLANG(major, minor, patchlevel) \ + ((__clang_major__ > (major)) || \ + (__clang_major__ == (major) && __clang_minor__ > (minor)) || \ + (__clang_major__ == (major) && __clang_minor__ == (minor) && \ + __clang_patchlevel__ >= 
(patchlevel))) +#else + #define WJR_HAS_CLANG(major, minor, patchlevel) 0 +#endif + +#if defined(_MSC_VER) + #define WJR_HAS_MSVC(minor, level) (_MSC_VER >= (minor)*100 + (level)) +#else + #define WJR_HAS_MSVC(minor, level) 0 +#endif + +#if (defined(WJR_COMPILER_GCC) && !WJR_HAS_GCC(7, 1, 0)) || \ + (defined(WJR_COMPILER_CLANG) && !WJR_HAS_CLANG(5, 0, 0)) + #error "GCC 7.1.0 or Clang 5.0.0 or later is required" +#endif + +#if defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC) + #define WJR_CXX_STANDARD __cplusplus +#elif defined(WJR_COMPILER_MSVC) + #define WJR_CXX_STANDARD _MSVC_LANG +#endif + +#if WJR_CXX_STANDARD >= 199711L + #define WJR_CXX_03 +#endif +#if WJR_CXX_STANDARD >= 201103L + #define WJR_CXX_11 +#endif +#if WJR_CXX_STANDARD >= 201402L + #define WJR_CXX_14 +#endif +#if WJR_CXX_STANDARD >= 201703L + #define WJR_CXX_17 +#endif +#if WJR_CXX_STANDARD >= 202002L + #define WJR_CXX_20 +#endif + +#ifndef WJR_CXX_17 + #error "required C++17 or later" +#endif // c++17 + +#if defined(__cpp_char8_t) + #define WJR_CHAR8_T +#endif // __cpp_char8_t + +#if defined(__LINE__) + #define WJR_LINE __LINE__ +#elif defined(__COUNTER__) + #define WJR_LINE __COUNTER__ +#else + #define WJR_LINE -1 +#endif + +#ifdef __FILE__ + #define WJR_FILE __FILE__ +#else + #define WJR_FILE "" +#endif + +// reference: boost BOOST_CURRENT_FUNCTION +#if defined(WJR_DISABLE_CURRENT_FUNCTION) + #define WJR_CURRENT_FUNCTION "(unknown)" +#elif defined(__GNUC__) || (defined(__MWERKS__) && (__MWERKS__ >= 0x3000)) || \ + (defined(__ICC) && (__ICC >= 600)) || defined(__ghs__) + #define WJR_CURRENT_FUNCTION __PRETTY_FUNCTION__ +#elif defined(__DMC__) && (__DMC__ >= 0x810) + #define WJR_CURRENT_FUNCTION __PRETTY_FUNCTION__ +#elif defined(__FUNCSIG__) + #define WJR_CURRENT_FUNCTION __FUNCSIG__ +#elif (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 600)) || \ + (defined(__IBMCPP__) && (__IBMCPP__ >= 500)) + #define WJR_CURRENT_FUNCTION __FUNCTION__ +#elif defined(__BORLANDC__) && (__BORLANDC__ >= 
0x550) + #define WJR_CURRENT_FUNCTION __FUNC__ +#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901) + #define WJR_CURRENT_FUNCTION __func__ +#elif defined(__cplusplus) && (__cplusplus >= 201103) + #define WJR_CURRENT_FUNCTION __func__ +#else + #define WJR_CURRENT_FUNCTION "(unknown)" +#endif + +#endif // !WJR_PREPROCESSOR_CONFIG_COMPILER_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/compiler/has.hpp b/include/wjr/preprocessor/config/has.hpp similarity index 57% rename from include/wjr/preprocessor/compiler/has.hpp rename to include/wjr/preprocessor/config/has.hpp index b921ce82..a944ed90 100644 --- a/include/wjr/preprocessor/compiler/has.hpp +++ b/include/wjr/preprocessor/config/has.hpp @@ -1,8 +1,8 @@ -#ifndef WJR_PREPROCESSOR_COMPILER_HAS_HPP__ -#define WJR_PREPROCESSOR_COMPILER_HAS_HPP__ +#ifndef WJR_PREPROCESSOR_CONFIG_HAS_HPP__ +#define WJR_PREPROCESSOR_CONFIG_HAS_HPP__ -#include -#include +#include +#include #include #include @@ -38,30 +38,31 @@ (defined(WJR_COMPILER_CLANG) && WJR_HAS_CLANG(10, 0, 0)) || \ (!defined(WJR_COMPILER_GCC) && !defined(WJR_COMPILER_CLANG) && \ defined(__has_builtin)) -#define WJR_HAS_BUILTIN(x) WJR_HAS_BUILTIN_I(x, WJR_HAS_BUILTIN_FIND(x)) -#define WJR_HAS_BUILTIN_I(x, VAR) WJR_PP_BOOL_IF(WJR_PP_BOOL(VAR), VAR, __has_builtin(x)) + #define WJR_HAS_BUILTIN(x) WJR_HAS_BUILTIN_I(x, WJR_HAS_BUILTIN_FIND(x)) + #define WJR_HAS_BUILTIN_I(x, VAR) \ + WJR_PP_BOOL_IF(WJR_PP_BOOL(VAR), VAR, __has_builtin(x)) #else -#define WJR_HAS_BUILTIN(x) WJR_HAS_BUILTIN_FIND(x) + #define WJR_HAS_BUILTIN(x) WJR_HAS_BUILTIN_FIND(x) #endif #if defined(__has_include) -#define WJR_HAS_INCLUDE(x) __has_include(x) + #define WJR_HAS_INCLUDE(x) __has_include(x) #else -#define WJR_HAS_INCLUDE(x) 0 + #define WJR_HAS_INCLUDE(x) 0 #endif // __has_include #if defined(__has_attribute) -#define WJR_HAS_ATTRIBUTE(x) WJR_HAS_ATTRIBUTE_I(x, WJR_HAS_ATTRIBUTE_FIND(x)) -#define WJR_HAS_ATTRIBUTE_I(x, VAR) \ - 
WJR_PP_BOOL_IF(WJR_PP_BOOL(VAR), VAR, __has_attribute(x)) + #define WJR_HAS_ATTRIBUTE(x) WJR_HAS_ATTRIBUTE_I(x, WJR_HAS_ATTRIBUTE_FIND(x)) + #define WJR_HAS_ATTRIBUTE_I(x, VAR) \ + WJR_PP_BOOL_IF(WJR_PP_BOOL(VAR), VAR, __has_attribute(x)) #else -#define WJR_HAS_ATTRIBUTE(x) WJR_HAS_ATTRIBUTE_FIND(x) + #define WJR_HAS_ATTRIBUTE(x) WJR_HAS_ATTRIBUTE_FIND(x) #endif #if defined(__has_cpp_attribute) -#define WJR_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) + #define WJR_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) #else -#define WJR_HAS_CPP_ATTRIBUTE(x) 0 + #define WJR_HAS_CPP_ATTRIBUTE(x) 0 #endif #define WJR_HAS_FEATURE(x) WJR_HAS_FEATURE_FIND(x) @@ -72,58 +73,58 @@ // WJR_HAS_BUILTIN #if WJR_HAS_GCC(7, 1, 0) || WJR_HAS_CLANG(5, 0, 0) -#define WJR_HAS_BUILTIN___builtin_unreachable WJR_HAS_DEF -#define WJR_HAS_BUILTIN___builtin_expect WJR_HAS_DEF -#define WJR_HAS_BUILTIN___builtin_constant_p WJR_HAS_DEF -#define WJR_HAS_BUILTIN___builtin_clz WJR_HAS_DEF -#define WJR_HAS_BUILTIN___builtin_ctz WJR_HAS_DEF -#define WJR_HAS_BUILTIN___builtin_popcount WJR_HAS_DEF + #define WJR_HAS_BUILTIN___builtin_unreachable WJR_HAS_DEF + #define WJR_HAS_BUILTIN___builtin_expect WJR_HAS_DEF + #define WJR_HAS_BUILTIN___builtin_constant_p WJR_HAS_DEF + #define WJR_HAS_BUILTIN___builtin_clz WJR_HAS_DEF + #define WJR_HAS_BUILTIN___builtin_ctz WJR_HAS_DEF + #define WJR_HAS_BUILTIN___builtin_popcount WJR_HAS_DEF #endif #if WJR_HAS_GCC(9, 1, 0) || WJR_HAS_CLANG(9, 0, 0) -#define WJR_HAS_BUILTIN___builtin_is_constant_evaluated WJR_HAS_DEF + #define WJR_HAS_BUILTIN___builtin_is_constant_evaluated WJR_HAS_DEF #endif #if WJR_HAS_CLANG(5, 0, 0) -#define WJR_HAS_BUILTIN___builtin_addc WJR_HAS_DEF -#define WJR_HAS_BUILTIN___builtin_subc WJR_HAS_DEF + #define WJR_HAS_BUILTIN___builtin_addc WJR_HAS_DEF + #define WJR_HAS_BUILTIN___builtin_subc WJR_HAS_DEF #endif #if WJR_HAS_GCC(9, 1, 0) || WJR_HAS_CLANG(11, 0, 0) -#define WJR_HAS_BUILTIN___builtin_expect_with_probability WJR_HAS_DEF + #define 
WJR_HAS_BUILTIN___builtin_expect_with_probability WJR_HAS_DEF #endif // WJR_HAS_FEATURE #if WJR_HAS_GCC(7, 1, 0) || WJR_HAS_CLANG(5, 0, 0) -#define WJR_HAS_FEATURE_PRAGMA_UNROLL WJR_HAS_DEF + #define WJR_HAS_FEATURE_PRAGMA_UNROLL WJR_HAS_DEF #endif #if defined(WJR_COMPILER_GCC) || defined(WJR_COMPILER_CLANG) -#define WJR_HAS_FEATURE_GCC_STYLE_INLINE_ASM WJR_HAS_DEF + #define WJR_HAS_FEATURE_GCC_STYLE_INLINE_ASM WJR_HAS_DEF #endif #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#if WJR_HAS_GCC(7, 1, 0) || WJR_HAS_CLANG(9, 0, 0) -#define WJR_HAS_FEATURE_INLINE_ASM_GOTO WJR_HAS_DEF -#endif + #if WJR_HAS_GCC(7, 1, 0) || WJR_HAS_CLANG(9, 0, 0) + #define WJR_HAS_FEATURE_INLINE_ASM_GOTO WJR_HAS_DEF + #endif -#if WJR_HAS_GCC(11, 1, 0) || WJR_HAS_CLANG(11, 0, 0) -#define WJR_HAS_FEATURE_INLINE_ASM_GOTO_OUTPUT WJR_HAS_DEF -#endif + #if WJR_HAS_GCC(11, 1, 0) || WJR_HAS_CLANG(11, 0, 0) + #define WJR_HAS_FEATURE_INLINE_ASM_GOTO_OUTPUT WJR_HAS_DEF + #endif -#if defined(WJR_COMPILER_GCC) || WJR_HAS_CLANG(9, 0, 0) -#define WJR_HAS_FEATURE_INLINE_ASM_CCCOND WJR_HAS_DEF -#endif + #if defined(__GCC_ASM_FLAG_OUTPUTS__) + #define WJR_HAS_FEATURE_INLINE_ASM_CCCOND WJR_HAS_DEF + #endif #endif #if defined(__SIZEOF_INT128__) -#define WJR_HAS_FEATURE_INT128 WJR_HAS_DEF -#if !(defined(__clang__) && defined(LIBDIVIDE_VC)) -#define WJR_HAS_FEATURE_INT128_DIV WJR_HAS_DEF -#endif + #define WJR_HAS_FEATURE_INT128 WJR_HAS_DEF + #if !(defined(__clang__) && defined(LIBDIVIDE_VC)) + #define WJR_HAS_FEATURE_INT128_DIV WJR_HAS_DEF + #endif #endif // There are some issues with the optimization of int128 in both lower and higher versions @@ -131,7 +132,7 @@ #if WJR_HAS_FEATURE(INT128) && \ (defined(WJR_COMPILER_CLANG) || \ (defined(WJR_COMPILER_GCC) && WJR_HAS_GCC(8, 1, 0) && !WJR_HAS_GCC(13, 1, 0))) -#define WJR_HAS_FEATURE_FAST_INT128_COMPARE WJR_HAS_DEF + #define WJR_HAS_FEATURE_FAST_INT128_COMPARE WJR_HAS_DEF #endif // performance bug @@ -140,82 +141,82 @@ #endif #if WJR_HAS_GCC(11, 1, 0) || 
WJR_HAS_CLANG(5, 0, 0) -#define WJR_HAS_FEATURE_FORCEINLINE_LAMBDA WJR_HAS_DEF + #define WJR_HAS_FEATURE_FORCEINLINE_LAMBDA WJR_HAS_DEF #endif #if WJR_HAS_GCC(8, 1, 0) || WJR_HAS_CLANG(7, 0, 0) -#define WJR_HAS_FEATURE_GOTO_POINTER WJR_HAS_DEF + #define WJR_HAS_FEATURE_GOTO_POINTER WJR_HAS_DEF #endif #if defined(__AVX512VL__) -#define WJR_HAS_SIMD_AVX512VL WJR_HAS_DEF + #define WJR_HAS_SIMD_AVX512VL WJR_HAS_DEF #endif #if defined(__AVX512BW__) -#define WJR_HAS_SIMD_AVX512BW WJR_HAS_DEF + #define WJR_HAS_SIMD_AVX512BW WJR_HAS_DEF #endif #if defined(__AVX512DQ__) -#define WJR_HAS_SIMD_AVX512DQ WJR_HAS_DEF + #define WJR_HAS_SIMD_AVX512DQ WJR_HAS_DEF #endif #if defined(__AVX512F__) || \ (WJR_HAS_SIMD(AVX512VL) && WJR_HAS_SIMD(AVX512BW) && WJR_HAS_SIMD(AVX512DQ)) -#define WJR_HAS_SIMD_AVX512F WJR_HAS_DEF + #define WJR_HAS_SIMD_AVX512F WJR_HAS_DEF #endif #if defined(__AVX512__) || \ (WJR_HAS_SIMD(AVX512F) && WJR_HAS_SIMD(AVX512BW) && WJR_HAS_SIMD(AVX512DQ)) -#define WJR_HAS_SIMD_AVX512 WJR_HAS_DEF + #define WJR_HAS_SIMD_AVX512 WJR_HAS_DEF #endif #if defined(__AVX2__) || (WJR_HAS_SIMD(AVX512) || WJR_HAS_SIMD(AVX512F)) -#define WJR_HAS_SIMD_AVX2 WJR_HAS_DEF + #define WJR_HAS_SIMD_AVX2 WJR_HAS_DEF #endif #if defined(__AVX__) || WJR_HAS_SIMD(AVX2) -#define WJR_HAS_SIMD_AVX WJR_HAS_DEF + #define WJR_HAS_SIMD_AVX WJR_HAS_DEF #endif #if defined(__SSE4_2__) || WJR_HAS_SIMD(AVX) -#define WJR_HAS_SIMD_SSE4_2 WJR_HAS_DEF + #define WJR_HAS_SIMD_SSE4_2 WJR_HAS_DEF #endif #if defined(__SSE4_1__) || WJR_HAS_SIMD(SSE4_2) -#define WJR_HAS_SIMD_SSE4_1 WJR_HAS_DEF + #define WJR_HAS_SIMD_SSE4_1 WJR_HAS_DEF #endif #if defined(__SSSE3__) || WJR_HAS_SIMD(SSE4_1) -#define WJR_HAS_SIMD_SSSE3 WJR_HAS_DEF + #define WJR_HAS_SIMD_SSSE3 WJR_HAS_DEF #endif #if defined(__SSE3__) || WJR_HAS_SIMD(SSSE3) -#define WJR_HAS_SIMD_SSE3 WJR_HAS_DEF + #define WJR_HAS_SIMD_SSE3 WJR_HAS_DEF #endif #if defined(__SSE2__) || WJR_HAS_SIMD(SSE3) || _M_IX86_FP >= 2 || \ (defined(_MSC_VER) && (defined(_M_AMD64) || 
defined(_M_X64))) -#define WJR_HAS_SIMD_SSE2 WJR_HAS_DEF + #define WJR_HAS_SIMD_SSE2 WJR_HAS_DEF #endif #if defined(__SSE__) || WJR_HAS_SIMD(SSE2) || _M_IX86_FP >= 1 -#define WJR_HAS_SIMD_SSE WJR_HAS_DEF + #define WJR_HAS_SIMD_SSE WJR_HAS_DEF #endif #if defined(__MMX__) || WJR_HAS_SIMD(SSE) -#define WJR_HAS_SIMD_MMX WJR_HAS_DEF + #define WJR_HAS_SIMD_MMX WJR_HAS_DEF #endif #if defined(__XOP__) -#define WJR_HAS_SIMD_XOP WJR_HAS_DEF + #define WJR_HAS_SIMD_XOP WJR_HAS_DEF #endif #if defined(__POPCNT__) -#define WJR_HAS_SIMD_POPCNT WJR_HAS_DEF + #define WJR_HAS_SIMD_POPCNT WJR_HAS_DEF #endif #if defined(__PCLMUL__) -#define WJR_HAS_SIMD_PCLMUL WJR_HAS_DEF + #define WJR_HAS_SIMD_PCLMUL WJR_HAS_DEF #endif -#endif // WJR_PREPROCESSOR_COMPILER_HAS_HPP__ \ No newline at end of file +#endif // WJR_PREPROCESSOR_CONFIG_HAS_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/config/platform.hpp b/include/wjr/preprocessor/config/platform.hpp new file mode 100644 index 00000000..a7509e70 --- /dev/null +++ b/include/wjr/preprocessor/config/platform.hpp @@ -0,0 +1,13 @@ +#ifndef WJR_PREPROCESSOR_CONFIG_PLATFORM_HPP__ +#define WJR_PREPROCESSOR_CONFIG_PLATFORM_HPP__ + +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + // define something for Windows (32-bit and 64-bit, this part is common) + #define WJR_WINDOWS +#elif defined(__linux__) + #define WJR_LINUX +#elif defined(__unix__) // all unices not caught above + #define WJR_UNIX +#endif + +#endif // WJR_PREPROCESSOR_CONFIG_PLATFORM_HPP__ \ No newline at end of file diff --git a/include/wjr/preprocessor/logical/bool.hpp b/include/wjr/preprocessor/logical/bool.hpp index b6590441..dd93d939 100644 --- a/include/wjr/preprocessor/logical/bool.hpp +++ b/include/wjr/preprocessor/logical/bool.hpp @@ -2,7 +2,6 @@ #define WJR_PREPROCESSOR_LOGICAL_BOOL_HPP__ #define WJR_PP_BOOL(x) WJR_PP_BOOL_I(x) - #define WJR_PP_BOOL_I(x) WJR_PP_BOOL_##x #define WJR_PP_BOOL_0 0 diff --git 
a/include/wjr/preprocessor/preview.hpp b/include/wjr/preprocessor/preview.hpp index 140617f1..cf4f3d94 100644 --- a/include/wjr/preprocessor/preview.hpp +++ b/include/wjr/preprocessor/preview.hpp @@ -4,7 +4,7 @@ // testing ... #include -#include +#include #include #include @@ -27,19 +27,19 @@ #define WJR_PRAGMA_I(expr) _Pragma(#expr) #if defined(WJR_COMPILER_GCC) || defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_MSVC) -#define WJR_PRAGMA(expr) WJR_PRAGMA_I(expr) + #define WJR_PRAGMA(expr) WJR_PRAGMA_I(expr) #else -#define WJR_PRAGMA(expr) + #define WJR_PRAGMA(expr) #endif #if WJR_HAS_FEATURE(PRAGMA_UNROLL) -#if defined(WJR_COMPILER_GCC) -#define WJR_UNROLL(loop) WJR_PRAGMA(GCC unroll(loop)) + #if defined(WJR_COMPILER_GCC) + #define WJR_UNROLL(loop) WJR_PRAGMA(GCC unroll(loop)) + #else + #define WJR_UNROLL(loop) WJR_PRAGMA(unroll(loop)) + #endif #else -#define WJR_UNROLL(loop) WJR_PRAGMA(unroll(loop)) -#endif -#else -#define WJR_UNROLL(loop) + #define WJR_UNROLL(loop) #endif #define WJR_IS_OVERLAP_P(p, pn, q, qn) ((p) + (pn) > (q) && (q) + (qn) > (p)) @@ -55,17 +55,17 @@ #define WJR_ASM_NOPIC_JMPL(LABEL) ".quad " #LABEL #if WJR_HAS_FEATURE(INLINE_ASM_CCCOND) -#define WJR_ASM_CCSET(c) "/* set condition codes */\n\t" -#define WJR_ASM_CCOUT(c) "=@cc" #c + #define WJR_ASM_CCSET(c) "/* set condition codes */\n\t" + #define WJR_ASM_CCOUT(c) "=@cc" #c #else -#define WJR_ASM_CCSET(c) "set" #c " %[_cc_" #c "]\n\t" -#define WJR_ASM_CCOUT(c) [_cc_##c] "=r" + #define WJR_ASM_CCSET(c) "set" #c " %[_cc_" #c "]\n\t" + #define WJR_ASM_CCOUT(c) [_cc_##c] "=r" #endif #if defined(WJR_DISABLE_EXCEPTIONS) -#define WJR_EXCEPTIONS_IF(ENABLE, DISABLE) DISABLE + #define WJR_EXCEPTIONS_IF(ENABLE, DISABLE) DISABLE #else -#define WJR_EXCEPTIONS_IF(ENABLE, DISABLE) ENABLE + #define WJR_EXCEPTIONS_IF(ENABLE, DISABLE) ENABLE #endif #define WJR_ENABLE_EXCEPTIONS_TRY_I try diff --git a/include/wjr/simd/simd.hpp b/include/wjr/simd/simd.hpp index 1186720b..aa27b74e 100644 --- 
a/include/wjr/simd/simd.hpp +++ b/include/wjr/simd/simd.hpp @@ -7,7 +7,7 @@ #include #if defined(WJR_X86) -#include + #include #endif namespace wjr { diff --git a/include/wjr/string.hpp b/include/wjr/string.hpp index 8388e7c5..94fce100 100644 --- a/include/wjr/string.hpp +++ b/include/wjr/string.hpp @@ -9,9 +9,9 @@ namespace wjr { #define WJR_HAS_FEATURE_STRING_UNINITIALIZED_RESIZE WJR_HAS_DEF #ifdef __cpp_lib_string_resize_and_overwrite -#define WJR_STRINF_RESIZE_AND_OVERWRITE __cpp_lib_string_resize_and_overwrite + #define WJR_STRINF_RESIZE_AND_OVERWRITE __cpp_lib_string_resize_and_overwrite #else -#define WJR_STRINF_RESIZE_AND_OVERWRITE 0 + #define WJR_STRINF_RESIZE_AND_OVERWRITE 0 #endif #if WJR_STRINF_RESIZE_AND_OVERWRITE >= 202110L @@ -23,84 +23,87 @@ __uninitialized_resize(std::basic_string &str, str.resize_and_overwrite(sz, [](char *, Size sz) { return sz; }); } -#define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_CLASS(...) -#define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(...) -#define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_HACKER(...) + #define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_CLASS(...) + #define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(...) + #define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_HACKER(...) 
#elif (defined(__clang_major__) && __clang_major__ <= 11) || \ (defined(_MSC_VER) && _MSC_VER <= 1920) -#undef WJR_HAS_FEATURE_STRING_UNINITIALIZED_RESIZE + #undef WJR_HAS_FEATURE_STRING_UNINITIALIZED_RESIZE #elif defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) || defined(_MSVC_STL_VERSION) template void string_set_length_hacker(Container &bank, typename Container::size_type sz); -#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) -#define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_CLASS(Name, Container) \ - inline void WJR_PP_CONCAT(string_set_length_hacker_of_, \ - Name)(Container & bank, typename Container::size_type sz); \ - template \ - struct WJR_PP_CONCAT(string_thief_of_, Name) { \ - friend void WJR_PP_CONCAT(string_set_length_hacker_of_, \ - Name)(Container & bank, \ - typename Container::size_type sz) { \ - (bank.*p)(sz); \ - } \ - } -#else -#define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_CLASS(Name, Container) \ - inline void WJR_PP_CONCAT(string_set_length_hacker_of_, \ - Name)(Container & bank, typename Container::size_type sz); \ - template \ - struct WJR_PP_CONCAT(string_thief_of_, Name) { \ - friend void WJR_PP_CONCAT(string_set_length_hacker_of_, \ - Name)(Container & bank, \ - typename Container::size_type sz) { \ - (bank.*p)._Myval2._Mysize = sz; \ - } \ - } -#endif - -#define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_HACKER(Name, Container) \ - template <> \ - inline void string_set_length_hacker(Container & bank, \ - typename Container::size_type sz) { \ - WJR_PP_CONCAT(string_set_length_hacker_of_, Name)(bank, sz); \ - }; - -#if defined(__GLIBCXX__) -#define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(Name, Container) \ - template struct WJR_PP_CONCAT( \ - string_thief_of_, Name) -#elif defined(_LIBCPP_VERSION) -#define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(Name, Container) \ - template struct WJR_PP_CONCAT( \ - string_thief_of_, Name) -#else -#define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(Name, 
Container) \ - template struct WJR_PP_CONCAT( \ - string_thief_of_, Name) -#endif + #if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) + #define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_CLASS(Name, Container) \ + inline void WJR_PP_CONCAT(string_set_length_hacker_of_, Name)( \ + Container & bank, typename Container::size_type sz); \ + template \ + struct WJR_PP_CONCAT(string_thief_of_, Name) { \ + friend void WJR_PP_CONCAT(string_set_length_hacker_of_, \ + Name)(Container & bank, \ + typename Container::size_type sz) { \ + (bank.*p)(sz); \ + } \ + } + #else + #define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_CLASS(Name, Container) \ + inline void WJR_PP_CONCAT(string_set_length_hacker_of_, Name)( \ + Container & bank, typename Container::size_type sz); \ + template \ + struct WJR_PP_CONCAT(string_thief_of_, Name) { \ + friend void WJR_PP_CONCAT(string_set_length_hacker_of_, \ + Name)(Container & bank, \ + typename Container::size_type sz) { \ + (bank.*p)._Myval2._Mysize = sz; \ + } \ + } + #endif + + #define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_HACKER(Name, Container) \ + template <> \ + inline void string_set_length_hacker( \ + Container & bank, typename Container::size_type sz) { \ + WJR_PP_CONCAT(string_set_length_hacker_of_, Name)(bank, sz); \ + }; + + #if defined(__GLIBCXX__) + #define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(Name, Container) \ + template struct WJR_PP_CONCAT( \ + string_thief_of_, \ + Name) + #elif defined(_LIBCPP_VERSION) + #define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(Name, Container) \ + template struct WJR_PP_CONCAT( \ + string_thief_of_, \ + Name) + #else + #define __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(Name, Container) \ + template struct WJR_PP_CONCAT( \ + string_thief_of_, \ + Name) + #endif template WJR_INTRINSIC_INLINE void __uninitialized_resize(std::basic_string &str, typename std::basic_string::size_type sz) { -#if !defined(WJR_CXX_20) + #if !defined(WJR_CXX_20) if (sz > 
str.capacity()) { -#endif + #endif str.reserve(sz); -#if !defined(WJR_CXX_20) + #if !defined(WJR_CXX_20) } -#endif + #endif string_set_length_hacker(str, sz); WJR_ASSERT_L2(str.size() == sz); str[sz] = '\0'; } #else -#undef WJR_HAS_FEATURE_STRING_UNINITIALIZED_RESIZE -#define WJR_REGISTER_STRING_UNINITIALIZED_RESIZE(Name, Container) + #undef WJR_HAS_FEATURE_STRING_UNINITIALIZED_RESIZE + #define WJR_REGISTER_STRING_UNINITIALIZED_RESIZE(Name, Container) #endif #if WJR_HAS_FEATURE(STRING_UNINITIALIZED_RESIZE) @@ -123,16 +126,16 @@ struct __uninitialized_append_fn_impl : append_fn_impl_base { } }; -#define WJR_REGISTER_STRING_UNINITIALIZED_RESIZE(Name, Container) \ - __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_CLASS(Name, Container); \ - __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(Name, Container); \ - __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_HACKER(Name, Container); \ - template <> \ - struct resize_fn_impl : __uninitialized_resize_fn_impl {}; \ - template <> \ - struct append_fn_impl : __uninitialized_append_fn_impl {} + #define WJR_REGISTER_STRING_UNINITIALIZED_RESIZE(Name, Container) \ + __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_CLASS(Name, Container); \ + __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_TEMPLATE(Name, Container); \ + __WJR_REGISTER_STRING_UNINITIALIZED_RESIZE_HACKER(Name, Container); \ + template <> \ + struct resize_fn_impl : __uninitialized_resize_fn_impl {}; \ + template <> \ + struct append_fn_impl : __uninitialized_append_fn_impl {} #else -#define WJR_REGISTER_STRING_UNINITIALIZED_RESIZE(Name, Container) + #define WJR_REGISTER_STRING_UNINITIALIZED_RESIZE(Name, Container) #endif namespace string_detail { diff --git a/include/wjr/tp/compiler.hpp b/include/wjr/tp/compiler.hpp index 6f416cc4..8c9d48f7 100644 --- a/include/wjr/tp/compiler.hpp +++ b/include/wjr/tp/compiler.hpp @@ -3,7 +3,6 @@ #include -namespace wjr { -} // namespace wjr +namespace wjr {} // namespace wjr -#endif // WJR_TP_COMPILER_HPP__ \ No newline at end of file +#endif // 
WJR_TP_COMPILER_HPP__ \ No newline at end of file diff --git a/include/wjr/type_traits.hpp b/include/wjr/type_traits.hpp index d6eca2f8..4a6914ba 100644 --- a/include/wjr/type_traits.hpp +++ b/include/wjr/type_traits.hpp @@ -10,6 +10,397 @@ #include +// GCC and compatible compilers define internal macros with builtin type traits + +#if defined(__SIZEOF_INT__) + #define WJR_SIZEOF_INT __SIZEOF_INT__ +#endif + +#if defined(__SIZEOF_LONG__) + #define WJR_SIZEOF_LONG __SIZEOF_LONG__ +#endif + +#if defined(__SIZEOF_LONG_LONG__) + #define WJR_SIZEOF_LONG_LONG __SIZEOF_LONG_LONG__ +#endif + +#if defined(__SIZEOF_SHORT__) + #define WJR_SIZEOF_SHORT __SIZEOF_SHORT__ +#endif + +#if defined(__SIZEOF_POINTER__) + #define WJR_SIZEOF_POINTER __SIZEOF_POINTER__ +#elif defined(_MSC_VER) + #if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_IA64) + #define WJR_SIZEOF_POINTER 8 + #else + #define WJR_SIZEOF_POINTER 4 + #endif +#endif + +#if defined(__SIZEOF_WCHAR_T__) + #define WJR_SIZEOF_WCHAR_T __SIZEOF_WCHAR_T__ +#endif + +#if !defined(WJR_SIZEOF_SHORT) || !defined(WJR_SIZEOF_INT) || \ + !defined(WJR_SIZEOF_LONG) || !defined(WJR_SIZEOF_LONG_LONG) || \ + !defined(WJR_SIZEOF_POINTER) + + #if !defined(WJR_SIZEOF_SHORT) + #if (USHRT_MAX + 0) == 0xff + #define WJR_SIZEOF_SHORT 1 + #elif (USHRT_MAX + 0) == 0xffff + #define WJR_SIZEOF_SHORT 2 + #elif (USHRT_MAX + 0) == 0xffffffff + #define WJR_SIZEOF_SHORT 4 + #elif (USHRT_MAX + 0) == UINT64_C(0xffffffffffffffff) + #define WJR_SIZEOF_SHORT 8 + #endif + #endif // !defined(WJR_SIZEOF_SHORT) + + #if !defined(WJR_SIZEOF_INT) + #if (UINT_MAX + 0) == 0xff + #define WJR_SIZEOF_INT 1 + #elif (UINT_MAX + 0) == 0xffff + #define WJR_SIZEOF_INT 2 + #elif (UINT_MAX + 0) == 0xffffffff + #define WJR_SIZEOF_INT 4 + #elif (UINT_MAX + 0) == UINT64_C(0xffffffffffffffff) + #define WJR_SIZEOF_INT 8 + #endif + #endif // !defined(WJR_SIZEOF_INT) + + #if !defined(WJR_SIZEOF_LONG) + #if (ULONG_MAX + 0) == 0xff + #define WJR_SIZEOF_LONG 1 + #elif (ULONG_MAX 
+ 0) == 0xffff + #define WJR_SIZEOF_LONG 2 + #elif (ULONG_MAX + 0) == 0xffffffff + #define WJR_SIZEOF_LONG 4 + #elif (ULONG_MAX + 0) == UINT64_C(0xffffffffffffffff) + #define WJR_SIZEOF_LONG 8 + #endif + #endif // !defined(WJR_SIZEOF_LONG) + + #if !defined(WJR_SIZEOF_LONG_LONG) + #if defined(__hpux) // HP-UX's value of ULONG_LONG_MAX is unusable in preprocessor + // expressions + #define WJR_SIZEOF_LONG_LONG 8 + #else + + // The list of the non-standard macros (the ones except ULLONG_MAX) is taken + // from cstdint.hpp + #if defined(ULLONG_MAX) + #define WJR_ULLONG_MAX ULLONG_MAX + #elif defined(ULONG_LONG_MAX) + #define WJR_ULLONG_MAX ULONG_LONG_MAX + #elif defined(ULONGLONG_MAX) + #define WJR_ULLONG_MAX ULONGLONG_MAX + #elif defined(_LLONG_MAX) // strangely enough, this one seems to be holding + // the limit for the unsigned integer + #define WJR_ULLONG_MAX _LLONG_MAX + #endif + + #if (WJR_ULLONG_MAX + 0) == 0xff + #define WJR_SIZEOF_LONG_LONG 1 + #elif (WJR_ULLONG_MAX + 0) == 0xffff + #define WJR_SIZEOF_LONG_LONG 2 + #elif (WJR_ULLONG_MAX + 0) == 0xffffffff + #define WJR_SIZEOF_LONG_LONG 4 + #elif (WJR_ULLONG_MAX + 0) == UINT64_C(0xffffffffffffffff) + #define WJR_SIZEOF_LONG_LONG 8 + #endif + + #endif // defined(__hpux) + #endif // !defined(WJR_SIZEOF_LONG_LONG) + + #if !defined(WJR_SIZEOF_POINTER) && defined(UINTPTR_MAX) + #if (UINTPTR_MAX + 0) == 0xffff + #define WJR_SIZEOF_POINTER 2 + #elif (UINTPTR_MAX + 0) == 0xffffffff + #define WJR_SIZEOF_POINTER 4 + #elif (UINTPTR_MAX + 0) == UINT64_C(0xffffffffffffffff) + #define WJR_SIZEOF_POINTER 8 + #endif + #endif // !defined(WJR_SIZEOF_POINTER) && defined(UINTPTR_MAX) + +#endif + +#if !defined(WJR_SIZEOF_WCHAR_T) + + #include + + #if defined(_MSC_VER) && (_MSC_VER <= 1310 || defined(UNDER_CE) && _MSC_VER <= 1500) + // MSVC 7.1 and MSVC 8 (arm) define WCHAR_MAX to a value not suitable for constant + // expressions + #define WJR_SIZEOF_WCHAR_T 2 + #elif (WCHAR_MAX + 0) == 0xff || (WCHAR_MAX + 0) == 0x7f + #define 
WJR_SIZEOF_WCHAR_T 1 + #elif (WCHAR_MAX + 0) == 0xffff || (WCHAR_MAX + 0) == 0x7fff + #define WJR_SIZEOF_WCHAR_T 2 + #elif (WCHAR_MAX + 0) == 0xffffffff || (WCHAR_MAX + 0) == 0x7fffffff + #define WJR_SIZEOF_WCHAR_T 4 + #elif (WCHAR_MAX + 0) == UINT64_C(0xffffffffffffffff) || \ + (WCHAR_MAX + 0) == INT64_C(0x7fffffffffffffff) + #define WJR_SIZEOF_WCHAR_T 8 + #endif +#endif + +#if !defined(WJR_SIZEOF_SHORT) || !defined(WJR_SIZEOF_INT) || \ + !defined(WJR_SIZEOF_LONG) || !defined(WJR_SIZEOF_LONG_LONG) || \ + !defined(WJR_SIZEOF_WCHAR_T) + #error "Not support" +#endif + +// Detect value sizes of the different floating point types. The value sizes may be less +// than the corresponding type sizes if the type contains padding bits. This is typical +// e.g. with x87 80-bit extended double types, which are often represented as 96 or +// 128-bit types. See: https://en.wikipedia.org/wiki/IEEE_754 For Intel x87 extended +// double see: +// https://en.wikipedia.org/wiki/Extended_precision#x86_Architecture_Extended_Precision_Format +// For IBM extended double (a.k.a. 
double-double) see: +// https://en.wikipedia.org/wiki/Long_double#Implementations, +// https://gcc.gnu.org/wiki/Ieee128PowerPC + +#if defined(__FLT_RADIX__) && defined(__FLT_MANT_DIG__) && defined(__FLT_MAX_EXP__) && \ + defined(__DBL_MANT_DIG__) && defined(__DBL_MAX_EXP__) && \ + defined(__LDBL_MANT_DIG__) && defined(__LDBL_MAX_EXP__) + + #if (__FLT_RADIX__ == 2) + + #if (__FLT_MANT_DIG__ == 11) && (__FLT_MAX_EXP__ == 16) // IEEE 754 binary16 + #define WJR_SIZEOF_FLOAT_VALUE 2 + #elif (__FLT_MANT_DIG__ == 24) && (__FLT_MAX_EXP__ == 128) // IEEE 754 binary32 + #define WJR_SIZEOF_FLOAT_VALUE 4 + #elif (__FLT_MANT_DIG__ == 53) && (__FLT_MAX_EXP__ == 1024) // IEEE 754 binary64 + #define WJR_SIZEOF_FLOAT_VALUE 8 + #elif (__FLT_MANT_DIG__ == 64 || __FLT_MANT_DIG__ == 53 || \ + __FLT_MANT_DIG__ == 24) && \ + (__FLT_MAX_EXP__ == 16384) // x87 extended double, with full 64-bit + // significand or reduced to 53 or 24 bits + #define WJR_SIZEOF_FLOAT_VALUE 10 + #elif (__FLT_MANT_DIG__ == 106) && \ + (__FLT_MAX_EXP__ == 1024) // IBM extended double + #define WJR_SIZEOF_FLOAT_VALUE 16 + #elif (__FLT_MANT_DIG__ == 113) && \ + (__FLT_MAX_EXP__ == 16384) // IEEE 754 binary128 + #define WJR_SIZEOF_FLOAT_VALUE 16 + #elif (__FLT_MANT_DIG__ == 237) && \ + (__FLT_MAX_EXP__ == 262144) // IEEE 754 binary256 + #define WJR_SIZEOF_FLOAT_VALUE 32 + #endif + + #if (__DBL_MANT_DIG__ == 11) && (__DBL_MAX_EXP__ == 16) // IEEE 754 binary16 + #define WJR_SIZEOF_DOUBLE_VALUE 2 + #elif (__DBL_MANT_DIG__ == 24) && (__DBL_MAX_EXP__ == 128) // IEEE 754 binary32 + #define WJR_SIZEOF_DOUBLE_VALUE 4 + #elif (__DBL_MANT_DIG__ == 53) && (__DBL_MAX_EXP__ == 1024) // IEEE 754 binary64 + #define WJR_SIZEOF_DOUBLE_VALUE 8 + #elif (__DBL_MANT_DIG__ == 64 || __DBL_MANT_DIG__ == 53 || \ + __DBL_MANT_DIG__ == 24) && \ + (__DBL_MAX_EXP__ == 16384) // x87 extended double, with full 64-bit + // significand or reduced to 53 or 24 bits + #define WJR_SIZEOF_DOUBLE_VALUE 10 + #elif (__DBL_MANT_DIG__ == 106) && \ + 
(__DBL_MAX_EXP__ == 1024) // IBM extended double + #define WJR_SIZEOF_DOUBLE_VALUE 16 + #elif (__DBL_MANT_DIG__ == 113) && \ + (__DBL_MAX_EXP__ == 16384) // IEEE 754 binary128 + #define WJR_SIZEOF_DOUBLE_VALUE 16 + #elif (__DBL_MANT_DIG__ == 237) && \ + (__DBL_MAX_EXP__ == 262144) // IEEE 754 binary256 + #define WJR_SIZEOF_DOUBLE_VALUE 32 + #endif + + #if (__LDBL_MANT_DIG__ == 11) && (__LDBL_MAX_EXP__ == 16) // IEEE 754 binary16 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 2 + #elif (__LDBL_MANT_DIG__ == 24) && (__LDBL_MAX_EXP__ == 128) // IEEE 754 binary32 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 4 + #elif (__LDBL_MANT_DIG__ == 53) && (__LDBL_MAX_EXP__ == 1024) // IEEE 754 binary64 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 8 + #elif (__LDBL_MANT_DIG__ == 64 || __LDBL_MANT_DIG__ == 53 || \ + __LDBL_MANT_DIG__ == 24) && \ + (__LDBL_MAX_EXP__ == 16384) // x87 extended double, with full 64-bit + // significand or reduced to 53 or 24 bits + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 10 + #elif (__LDBL_MANT_DIG__ == 106) && \ + (__LDBL_MAX_EXP__ == 1024) // IBM extended double + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 16 + #elif (__LDBL_MANT_DIG__ == 113) && \ + (__LDBL_MAX_EXP__ == 16384) // IEEE 754 binary128 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 16 + #elif (__LDBL_MANT_DIG__ == 237) && \ + (__LDBL_MAX_EXP__ == 262144) // IEEE 754 binary256 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 32 + #endif + + #elif (__FLT_RADIX__ == 10) + + #if (__FLT_MANT_DIG__ == 7) && (__FLT_MAX_EXP__ == 97) // IEEE 754 decimal32 + #define WJR_SIZEOF_FLOAT_VALUE 4 + #elif (__FLT_MANT_DIG__ == 16) && (__FLT_MAX_EXP__ == 385) // IEEE 754 decimal64 + #define WJR_SIZEOF_FLOAT_VALUE 8 + #elif (__FLT_MANT_DIG__ == 34) && (__FLT_MAX_EXP__ == 6145) // IEEE 754 decimal128 + #define WJR_SIZEOF_FLOAT_VALUE 16 + #endif + + #if (__DBL_MANT_DIG__ == 7) && (__DBL_MAX_EXP__ == 97) // IEEE 754 decimal32 + #define WJR_SIZEOF_DOUBLE_VALUE 4 + #elif (__DBL_MANT_DIG__ == 16) && (__DBL_MAX_EXP__ == 385) // IEEE 754 decimal64 + #define 
WJR_SIZEOF_DOUBLE_VALUE 8 + #elif (__DBL_MANT_DIG__ == 34) && (__DBL_MAX_EXP__ == 6145) // IEEE 754 decimal128 + #define WJR_SIZEOF_DOUBLE_VALUE 16 + #endif + + #if (__LDBL_MANT_DIG__ == 7) && (__LDBL_MAX_EXP__ == 97) // IEEE 754 decimal32 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 4 + #elif (__LDBL_MANT_DIG__ == 16) && (__LDBL_MAX_EXP__ == 385) // IEEE 754 decimal64 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 8 + #elif (__LDBL_MANT_DIG__ == 34) && \ + (__LDBL_MAX_EXP__ == 6145) // IEEE 754 decimal128 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 16 + #endif + + #endif + +#else // defined(__FLT_RADIX__) ... + + #include + + #if (FLT_RADIX == 2) + + #if (FLT_MANT_DIG == 11) && (FLT_MAX_EXP == 16) // IEEE 754 binary16 + #define WJR_SIZEOF_FLOAT_VALUE 2 + #elif (FLT_MANT_DIG == 24) && (FLT_MAX_EXP == 128) // IEEE 754 binary32 + #define WJR_SIZEOF_FLOAT_VALUE 4 + #elif (FLT_MANT_DIG == 53) && (FLT_MAX_EXP == 1024) // IEEE 754 binary64 + #define WJR_SIZEOF_FLOAT_VALUE 8 + #elif (FLT_MANT_DIG == 64 || FLT_MANT_DIG == 53 || FLT_MANT_DIG == 24) && \ + (FLT_MAX_EXP == 16384) // x87 extended double, with full 64-bit significand or + // reduced to 53 or 24 bits + #define WJR_SIZEOF_FLOAT_VALUE 10 + #elif (FLT_MANT_DIG == 106) && (FLT_MAX_EXP == 1024) // IBM extended double + #define WJR_SIZEOF_FLOAT_VALUE 16 + #elif (FLT_MANT_DIG == 113) && (FLT_MAX_EXP == 16384) // IEEE 754 binary128 + #define WJR_SIZEOF_FLOAT_VALUE 16 + #elif (FLT_MANT_DIG == 237) && (FLT_MAX_EXP == 262144) // IEEE 754 binary256 + #define WJR_SIZEOF_FLOAT_VALUE 32 + #endif + + #if (DBL_MANT_DIG == 11) && (DBL_MAX_EXP == 16) // IEEE 754 binary16 + #define WJR_SIZEOF_DOUBLE_VALUE 2 + #elif (DBL_MANT_DIG == 24) && (DBL_MAX_EXP == 128) // IEEE 754 binary32 + #define WJR_SIZEOF_DOUBLE_VALUE 4 + #elif (DBL_MANT_DIG == 53) && (DBL_MAX_EXP == 1024) // IEEE 754 binary64 + #define WJR_SIZEOF_DOUBLE_VALUE 8 + #elif (DBL_MANT_DIG == 64 || DBL_MANT_DIG == 53 || DBL_MANT_DIG == 24) && \ + (DBL_MAX_EXP == 16384) // x87 extended double, 
with full 64-bit significand or + // reduced to 53 or 24 bits + #define WJR_SIZEOF_DOUBLE_VALUE 10 + #elif (DBL_MANT_DIG == 106) && (DBL_MAX_EXP == 1024) // IBM extended double + #define WJR_SIZEOF_DOUBLE_VALUE 16 + #elif (DBL_MANT_DIG == 113) && (DBL_MAX_EXP == 16384) // IEEE 754 binary128 + #define WJR_SIZEOF_DOUBLE_VALUE 16 + #elif (DBL_MANT_DIG == 237) && (DBL_MAX_EXP == 262144) // IEEE 754 binary256 + #define WJR_SIZEOF_DOUBLE_VALUE 32 + #endif + + #if (LDBL_MANT_DIG == 11) && (LDBL_MAX_EXP == 16) // IEEE 754 binary16 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 2 + #elif (LDBL_MANT_DIG == 24) && (LDBL_MAX_EXP == 128) // IEEE 754 binary32 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 4 + #elif (LDBL_MANT_DIG == 53) && (LDBL_MAX_EXP == 1024) // IEEE 754 binary64 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 8 + #elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 53 || LDBL_MANT_DIG == 24) && \ + (LDBL_MAX_EXP == 16384) // x87 extended double, with full 64-bit significand + // or reduced to 53 or 24 bits + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 10 + #elif (LDBL_MANT_DIG == 106) && (LDBL_MAX_EXP == 1024) // IBM extended double + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 16 + #elif (LDBL_MANT_DIG == 113) && (LDBL_MAX_EXP == 16384) // IEEE 754 binary128 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 16 + #elif (LDBL_MANT_DIG == 237) && (LDBL_MAX_EXP == 262144) // IEEE 754 binary256 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 32 + #endif + + #elif (FLT_RADIX == 10) + + #if (FLT_MANT_DIG == 7) && (FLT_MAX_EXP == 97) // IEEE 754 decimal32 + #define WJR_SIZEOF_FLOAT_VALUE 4 + #elif (FLT_MANT_DIG == 16) && (FLT_MAX_EXP == 385) // IEEE 754 decimal64 + #define WJR_SIZEOF_FLOAT_VALUE 8 + #elif (FLT_MANT_DIG == 34) && (FLT_MAX_EXP == 6145) // IEEE 754 decimal128 + #define WJR_SIZEOF_FLOAT_VALUE 16 + #endif + + #if (DBL_MANT_DIG == 7) && (DBL_MAX_EXP == 97) // IEEE 754 decimal32 + #define WJR_SIZEOF_DOUBLE_VALUE 4 + #elif (DBL_MANT_DIG == 16) && (DBL_MAX_EXP == 385) // IEEE 754 decimal64 + #define 
WJR_SIZEOF_DOUBLE_VALUE 8 + #elif (DBL_MANT_DIG == 34) && (DBL_MAX_EXP == 6145) // IEEE 754 decimal128 + #define WJR_SIZEOF_DOUBLE_VALUE 16 + #endif + + #if (LDBL_MANT_DIG == 7) && (LDBL_MAX_EXP == 97) // IEEE 754 decimal32 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 4 + #elif (LDBL_MANT_DIG == 16) && (LDBL_MAX_EXP == 385) // IEEE 754 decimal64 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 8 + #elif (LDBL_MANT_DIG == 34) && (LDBL_MAX_EXP == 6145) // IEEE 754 decimal128 + #define WJR_SIZEOF_LONG_DOUBLE_VALUE 16 + #endif + + #endif + +#endif // defined(__FLT_RADIX__) ... + +// GCC and compatible compilers define internal macros with builtin type traits +#if defined(__SIZEOF_FLOAT__) + #define WJR_SIZEOF_FLOAT __SIZEOF_FLOAT__ +#endif +#if defined(__SIZEOF_DOUBLE__) + #define WJR_SIZEOF_DOUBLE __SIZEOF_DOUBLE__ +#endif +#if defined(__SIZEOF_LONG_DOUBLE__) + #define WJR_SIZEOF_LONG_DOUBLE __SIZEOF_LONG_DOUBLE__ +#endif + +#if !defined(WJR_SIZEOF_FLOAT) || !defined(WJR_SIZEOF_DOUBLE) || \ + !defined(WJR_SIZEOF_LONG_DOUBLE) + + #define WJR_ALIGN_SIZE_TO_POWER_OF_2(x) \ + ((x) == 1u \ + ? 1u \ + : ((x) == 2u \ + ? 2u \ + : ((x) <= 4u \ + ? 4u \ + : ((x) <= 8u \ + ? 8u \ + : ((x) <= 16u ? 16u : ((x) <= 32u ? 32u : (x))))))) + + // Make our best guess. These sizes may not be accurate, but they are good enough to + // estimate the size of the storage required to hold these types. 
+ #if !defined(WJR_SIZEOF_FLOAT) && defined(WJR_SIZEOF_FLOAT_VALUE) + #define WJR_SIZEOF_FLOAT WJR_ALIGN_SIZE_TO_POWER_OF_2(WJR_SIZEOF_FLOAT_VALUE) + #endif + #if !defined(WJR_SIZEOF_DOUBLE) && defined(WJR_SIZEOF_DOUBLE_VALUE) + #define WJR_SIZEOF_DOUBLE WJR_ALIGN_SIZE_TO_POWER_OF_2(WJR_SIZEOF_DOUBLE_VALUE) + #endif + #if !defined(WJR_SIZEOF_LONG_DOUBLE) && defined(WJR_SIZEOF_LONG_DOUBLE_VALUE) + #define WJR_SIZEOF_LONG_DOUBLE \ + WJR_ALIGN_SIZE_TO_POWER_OF_2(WJR_SIZEOF_LONG_DOUBLE_VALUE) + #endif + +#endif // !defined(WJR_SIZEOF_FLOAT) || + // !defined(WJR_SIZEOF_DOUBLE) || + // !defined(WJR_SIZEOF_LONG_DOUBLE) + +#if !defined(WJR_SIZEOF_FLOAT_VALUE) || !defined(WJR_SIZEOF_FLOAT) || \ + !defined(WJR_SIZEOF_DOUBLE_VALUE) || !defined(WJR_SIZEOF_DOUBLE) || \ + !defined(WJR_SIZEOF_LONG_DOUBLE_VALUE) || !defined(WJR_SIZEOF_LONG_DOUBLE) + #error "Not support" +#endif + namespace wjr { enum class branch { diff --git a/src/wjr/format/dragonbox.cpp b/src/wjr/format/dragonbox.cpp index 8333eefa..19bf37ef 100644 --- a/src/wjr/format/dragonbox.cpp +++ b/src/wjr/format/dragonbox.cpp @@ -19,7 +19,7 @@ #include #ifndef JKJ_STATIC_DATA_SECTION -#define JKJ_STATIC_DATA_SECTION + #define JKJ_STATIC_DATA_SECTION #endif namespace wjr { diff --git a/src/wjr/json/json.cpp b/src/wjr/json/json.cpp index 44c40952..42bd7762 100644 --- a/src/wjr/json/json.cpp +++ b/src/wjr/json/json.cpp @@ -82,8 +82,8 @@ class check_parser { return check_string(first, last); } - WJR_INTRINSIC_INLINE result visit_object_key_string(const char *first, - const char *last) noexcept { + WJR_INTRINSIC_INLINE result + visit_object_key_string(const char *first, const char *last) const noexcept { return check_string(first, last); } @@ -91,11 +91,11 @@ class check_parser { return {}; } - WJR_INTRINSIC_INLINE result visit_object_start_object(uint32_t) noexcept { + WJR_INTRINSIC_INLINE result visit_object_start_object(uint32_t) const noexcept { return {}; } - WJR_INTRINSIC_INLINE result 
visit_array_start_object(uint32_t) noexcept { + WJR_INTRINSIC_INLINE result visit_array_start_object(uint32_t) const noexcept { return {}; } @@ -103,19 +103,20 @@ class check_parser { return {}; } - WJR_INTRINSIC_INLINE result visit_object_start_array(uint32_t) noexcept { + WJR_INTRINSIC_INLINE result visit_object_start_array(uint32_t) const noexcept { return {}; } - WJR_INTRINSIC_INLINE result visit_array_start_array(uint32_t) noexcept { + WJR_INTRINSIC_INLINE result visit_array_start_array(uint32_t) const noexcept { return {}; } - WJR_INTRINSIC_INLINE result visit_end_object_to_object(uint32_t) noexcept { + WJR_INTRINSIC_INLINE result + visit_end_object_to_object(uint32_t) const noexcept { return {}; } - WJR_INTRINSIC_INLINE result visit_end_object_to_array(uint32_t) noexcept { + WJR_INTRINSIC_INLINE result visit_end_object_to_array(uint32_t) const noexcept { return {}; } @@ -123,11 +124,11 @@ class check_parser { return {}; } - WJR_INTRINSIC_INLINE result visit_end_array_to_object(uint32_t) noexcept { + WJR_INTRINSIC_INLINE result visit_end_array_to_object(uint32_t) const noexcept { return {}; } - WJR_INTRINSIC_INLINE result visit_end_array_to_array(uint32_t) noexcept { + WJR_INTRINSIC_INLINE result visit_end_array_to_array(uint32_t) const noexcept { return {}; } diff --git a/src/wjr/json/lexer.cpp b/src/wjr/json/lexer.cpp index bb2cdc9d..d1958dac 100644 --- a/src/wjr/json/lexer.cpp +++ b/src/wjr/json/lexer.cpp @@ -103,6 +103,8 @@ typename lexer::result_type lexer::read(uint32_t *token_buf, const auto num = popcount(S); do { + WJR_ASSUME(!(idx & 0x3F)); + token_buf[0] = idx + ctz(S); S &= S - 1; token_buf[1] = idx + ctz(S); diff --git a/src/wjr/memory/memory_pool.cpp b/src/wjr/memory/memory_pool.cpp index 23a8189e..02430e25 100644 --- a/src/wjr/memory/memory_pool.cpp +++ b/src/wjr/memory/memory_pool.cpp @@ -56,9 +56,8 @@ char *__default_alloc_template__::object::chunk_alloc(unsigned int idx, return (chunk_alloc(idx, nobjs)); } -static constexpr std::array 
__nobjs_table = { - 32, 32, 32, 32, 16, 16, 16, 16, 8, 8, 4, 4 -}; +static constexpr std::array __nobjs_table = {32, 32, 32, 32, 16, 16, + 16, 16, 8, 8, 4, 4}; void *__default_alloc_template__::object::refill(unsigned int idx) noexcept { auto nobjs = static_cast(__nobjs_table[idx]); diff --git a/src/wjr/x86/json/lexer.cpp b/src/wjr/x86/json/lexer.cpp index 55ae9ca8..5bace22e 100644 --- a/src/wjr/x86/json/lexer.cpp +++ b/src/wjr/x86/json/lexer.cpp @@ -103,7 +103,7 @@ const static std::array pshufb_combine_table = { 0xff, 0xff, 0xff, 0xff, }; -#if !WJR_HAS_SIMD(AVX2) + #if !WJR_HAS_SIMD(AVX2) const static __m128i lh8_mask = sse::set1_epi8(0x0f); const static __m128i lo8_lookup = @@ -126,7 +126,7 @@ WJR_INTRINSIC_INLINE void compress(char *dst, __m128i x, uint16_t mask) noexcept sse::storeu(dst, almostthere); } -#else + #else const static __m256i lh8_mask = avx::set1_epi8(0x0f); const static __m256i lo8_lookup = avx::set_epi8(0, 0, 12, 1, 4, 10, 8, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 12, 1, 4, 10, @@ -159,7 +159,7 @@ WJR_INTRINSIC_INLINE void compress(char *dst, __m256i x, uint32_t mask) noexcept sse::storeu(dst + 16 - popcount(mask & 0xFFFF), avx::gethigh(almostthere)); } -#endif + #endif } // namespace lexer_detail @@ -239,18 +239,19 @@ typename lexer::result_type lexer::read(uint32_t *token_buf, // colon : 2 // brackets : 4 // whitespace : 8, 16 + // others : 0 - const uint32_t stu = simd::movemask_epi8( - simd::cmpeq_epi8(simd::And(result, simd::set1_epi8(7)), simd::zeros())); - const uint32_t wsp = simd::movemask_epi8( - simd::cmpeq_epi8(simd::And(result, simd::set1_epi8(24)), simd::zeros())); + const uint32_t stu = + simd::movemask_epi8(simd::cmpgt_epi8(result, simd::zeros())); + const uint32_t wsp = + simd::movemask_epi8(simd::cmpgt_epi8(result, simd::set1_epi8(7))); S |= (uint64_t)(stu) << (i * u8_width); W |= (uint64_t)(wsp) << (i * u8_width); } - S = ~S; - W = ~W; + const auto WS = S; + S ^= W; if (WJR_LIKELY(!B)) { B = prev_is_escape; @@ -266,11 +267,10 @@ 
typename lexer::result_type lexer::read(uint32_t *token_buf, const uint64_t R = prefix_xor(Q) ^ prev_in_string; prev_in_string = static_cast(static_cast(R) >> 63); - const auto WS = S | W; const auto WT = shld(WS, prev_is_ws, 1); prev_is_ws = WS; - S |= (WT & ~W); + S |= WT & ~W; S &= ~R; S |= Q; @@ -278,13 +278,16 @@ typename lexer::result_type lexer::read(uint32_t *token_buf, const auto num = popcount(S); do { - token_buf[0] = idx + ctz(S); + const auto __idx = idx; + WJR_ASSUME(!(__idx & 0x3F)); + + token_buf[0] = __idx + ctz(S); S &= S - 1; - token_buf[1] = idx + ctz(S); + token_buf[1] = __idx + ctz(S); S &= S - 1; - token_buf[2] = idx + ctz(S); + token_buf[2] = __idx + ctz(S); S &= S - 1; - token_buf[3] = idx + ctz(S); + token_buf[3] = __idx + ctz(S); if (WJR_UNLIKELY(num <= 4)) { break; @@ -292,13 +295,13 @@ typename lexer::result_type lexer::read(uint32_t *token_buf, S &= S - 1; - token_buf[4] = idx + ctz(S); + token_buf[4] = __idx + ctz(S); S &= S - 1; - token_buf[5] = idx + ctz(S); + token_buf[5] = __idx + ctz(S); S &= S - 1; - token_buf[6] = idx + ctz(S); + token_buf[6] = __idx + ctz(S); S &= S - 1; - token_buf[7] = idx + ctz(S); + token_buf[7] = __idx + ctz(S); if (WJR_LIKELY(num <= 8)) { break; @@ -307,7 +310,7 @@ typename lexer::result_type lexer::read(uint32_t *token_buf, S &= S - 1; for (int i = 8; i < 12; ++i) { - token_buf[i] = idx + ctz(S); + token_buf[i] = __idx + ctz(S); S &= S - 1; } @@ -316,7 +319,7 @@ typename lexer::result_type lexer::read(uint32_t *token_buf, } for (int i = 12; i < 16; ++i) { - token_buf[i] = idx + ctz(S); + token_buf[i] = __idx + ctz(S); S &= S - 1; } @@ -325,7 +328,7 @@ typename lexer::result_type lexer::read(uint32_t *token_buf, } for (int i = 16; i < num; ++i) { - token_buf[i] = idx + ctz(S); + token_buf[i] = __idx + ctz(S); S &= S - 1; } } while (false); @@ -411,6 +414,7 @@ char *minify(char *dst, const char *first, const char *last) noexcept { const auto result = simd::And(shuf_lo8, shuf_hi8); // whitespace : 8, 16 + 
// others : 0, 1, 2, 4 const uint32_t wsp = simd::movemask_epi8( simd::cmpeq_epi8(simd::And(result, simd::set1_epi8(24)), simd::zeros())); diff --git a/src/wjr/x86/json/string.cpp b/src/wjr/x86/json/string.cpp index c77ef0d9..34d36dc1 100644 --- a/src/wjr/x86/json/string.cpp +++ b/src/wjr/x86/json/string.cpp @@ -8,13 +8,13 @@ namespace wjr::json { WJR_INTRINSIC_INLINE char *small_copy(char *dst, const char *src, unsigned int n) noexcept { if (n >= 8) { -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) if (n >= 16) { std::memcpy(dst, src, 16); std::memcpy(dst + n - 16, src + n - 16, 16); return dst + n; } -#endif + #endif std::memcpy(dst, src, 8); std::memcpy(dst + n - 8, src + n - 8, 8); return dst + n; @@ -164,7 +164,7 @@ result parse_string(char *dst, const char *first, const char *last) noex simd_mask_type B; do { -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) if (WJR_UNLIKELY(n > 16)) { const auto m = n - 16; const auto x0 = sse::loadu(first); @@ -180,7 +180,7 @@ result parse_string(char *dst, const char *first, const char *last) noex break; } -#endif + #endif if (WJR_UNLIKELY(n <= 8)) { if (WJR_UNLIKELY(n <= 4)) { @@ -408,7 +408,7 @@ result check_string(const char *first, const char *last) noexcept { simd_mask_type B; do { -#if WJR_HAS_SIMD(AVX2) + #if WJR_HAS_SIMD(AVX2) if (WJR_UNLIKELY(n > 16)) { const auto m = n - 16; const auto x0 = sse::loadu(first); @@ -422,7 +422,7 @@ result check_string(const char *first, const char *last) noexcept { break; } -#endif + #endif if (WJR_UNLIKELY(n <= 8)) { if (WJR_UNLIKELY(n <= 4)) { diff --git a/src/wjr/x86/math/mul.cpp b/src/wjr/x86/math/mul.cpp index 84c9cae3..ccb8cbee 100644 --- a/src/wjr/x86/math/mul.cpp +++ b/src/wjr/x86/math/mul.cpp @@ -472,13 +472,13 @@ uint64_t __wjr_asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, #endif #if WJR_HAS_BUILTIN(ASM_ADDLSH_N) == 1 -#define WJR_ADDSUB_I 1 -#include "gen_addrsblsh_n.hpp" + #define WJR_ADDSUB_I 1 + #include "gen_addrsblsh_n.hpp" #endif #if 
WJR_HAS_BUILTIN(ASM_RSBLSH_N) == 1 -#define WJR_ADDSUB_I 0 -#include "gen_addrsblsh_n.hpp" + #define WJR_ADDSUB_I 0 + #include "gen_addrsblsh_n.hpp" #endif #if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) == 1 diff --git a/src/wjr/x86/math/simd.cpp b/src/wjr/x86/math/simd.cpp index bc23dc8d..f3c25195 100644 --- a/src/wjr/x86/math/simd.cpp +++ b/src/wjr/x86/math/simd.cpp @@ -1,6 +1,6 @@ -#include #include #include +#include namespace wjr { diff --git a/third-party/atomic b/third-party/atomic new file mode 160000 index 00000000..fbdb5f44 --- /dev/null +++ b/third-party/atomic @@ -0,0 +1 @@ +Subproject commit fbdb5f44d4ace030d8a8836cc4f3738bcc1c6d72