diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 31572a968f..237710c5f4 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -106,6 +106,7 @@ function(add_benchmark name) target_link_libraries(benchmark-${name} PRIVATE benchmark::benchmark) endfunction() +add_benchmark(bitset_from_string src/bitset_from_string.cpp) add_benchmark(bitset_to_string src/bitset_to_string.cpp) add_benchmark(efficient_nonlocking_print src/efficient_nonlocking_print.cpp) add_benchmark(find_and_count src/find_and_count.cpp) diff --git a/benchmarks/src/bitset_from_string.cpp b/benchmarks/src/bitset_from_string.cpp new file mode 100644 index 0000000000..78faf10a37 --- /dev/null +++ b/benchmarks/src/bitset_from_string.cpp @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include + +using namespace std; + +namespace { + template + auto random_digits_init() { + mt19937_64 rnd{}; + uniform_int_distribution<> dis('0', '1'); + + constexpr size_t number_of_bitsets = (Min_length + N - 1) / N; + static_assert(number_of_bitsets != 0); + + constexpr size_t actual_size = number_of_bitsets * (N + 1); // +1 for \0 + + array result; + + for (size_t i = 0; i < actual_size; ++i) { + if (i % (N + 1) == N) { + result[i] = charT{'\0'}; // write null terminators + } else { + result[i] = static_cast(dis(rnd)); // fill random digits + } + } + + return result; + } + + enum class length_type : bool { char_count, null_term }; + + template + const auto random_digits = random_digits_init(); + + template + void BM_bitset_from_string(benchmark::State& state) { + const auto& digit_array = random_digits; + for (auto _ : state) { + benchmark::DoNotOptimize(digit_array); + const auto arr_data = digit_array.data(); + const auto arr_size = digit_array.size(); + for (size_t pos = 0; pos != arr_size; pos += N + 1) { + if constexpr (Length == length_type::char_count) { + bitset bs(arr_data + pos, N); + benchmark::DoNotOptimize(bs); + } else { + bitset bs(arr_data + pos); + benchmark::DoNotOptimize(bs); + } + } + } + } +} // namespace + +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); + +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); + +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); + +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); +BENCHMARK(BM_bitset_from_string); + +BENCHMARK_MAIN(); diff --git a/stl/inc/bitset b/stl/inc/bitset index acb60bca93..0fcff9b029 100644 --- a/stl/inc/bitset +++ b/stl/inc/bitset @@ -23,11 +23,19 @@ _STL_DISABLE_CLANG_WARNINGS #endif // !defined(_STD_BITSET_TO_STREAM_STACK_RESERVATION) #if _USE_STD_VECTOR_ALGORITHMS +// These bitset functions sometimes assume that the bit array has zero padding to a multiple of 2 or 4 bytes. +// The assumptions hold true even for the vNext suggestion to use smaller types for small bitsets (see GH-1498) +// due to vectorization thresholds. + extern "C" { __declspec(noalias) void __stdcall __std_bitset_to_string_1( char* _Dest, const void* _Src, size_t _Size_bits, char _Elem0, char _Elem1) noexcept; __declspec(noalias) void __stdcall __std_bitset_to_string_2( wchar_t* _Dest, const void* _Src, size_t _Size_bits, wchar_t _Elem0, wchar_t _Elem1) noexcept; +__declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept; +__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept; } // extern "C" #endif // _USE_STD_VECTOR_ALGORITHMS @@ -115,6 +123,30 @@ public: private: template _CONSTEXPR23 void _Construct(const _Elem* const _Ptr, size_t _Count, const _Elem _Elem0, const _Elem _Elem1) { +#if _USE_STD_VECTOR_ALGORITHMS + constexpr size_t _Bitset_from_string_vector_threshold = 16; + if constexpr (_Bits >= _Bitset_from_string_vector_threshold + && _Is_implementation_handled_char_traits<_Traits> && sizeof(_Elem) <= 2) { + if (!_STD _Is_constant_evaluated()) { + bool _Result; + + if constexpr (sizeof(_Elem) == 1) { + _Result = __std_bitset_from_string_1(_Array, reinterpret_cast(_Ptr), sizeof(_Array), + _Bits, _Count, static_cast(_Elem0), static_cast(_Elem1)); + } else { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_Elem) == 2); + _Result = __std_bitset_from_string_2(_Array, reinterpret_cast(_Ptr), sizeof(_Array), + _Bits, _Count, static_cast(_Elem0), static_cast(_Elem1)); + } + + if (!_Result) { + _Xinv(); + } + + return; + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS if (_Count > _Bits) { for (size_t _Idx = _Bits; _Idx < _Count; ++_Idx) { const auto _Ch = _Ptr[_Idx]; @@ -462,8 +494,8 @@ public: _CONSTEXPR23 void _To_string( _Elem* const _Buf, const size_t _Len, const _Elem _Elem0, const _Elem _Elem1) const noexcept { #if _USE_STD_VECTOR_ALGORITHMS - constexpr size_t _Bitset_vector_threshold = 32; - if constexpr (_Bits >= _Bitset_vector_threshold && is_integral_v<_Elem> && sizeof(_Elem) <= 2) { + constexpr size_t _Bitset_to_string_vector_threshold = 32; + if constexpr (_Bits >= _Bitset_to_string_vector_threshold && is_integral_v<_Elem> && sizeof(_Elem) <= 2) { if (!_Is_constant_evaluated()) { if constexpr (sizeof(_Elem) == 1) { __std_bitset_to_string_1(reinterpret_cast(_Buf), _Array, _Len, static_cast(_Elem0), diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 3d909e8e55..226b43a04d 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3703,8 +3703,8 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_1( if (_Size_bits > 0) { __assume(_Size_bits < 32); - uint32_t _Val = 0; - memcpy(&_Val, _Src, (_Size_bits + 7) / 8); + uint32_t _Val; + memcpy(&_Val, _Src, 4); const __m256i _Elems = _Bitset_to_string_1_step_avx(_Val, _Px0, _Px1); char _Tmp[32]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems); @@ -3733,11 +3733,7 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_1( if (_Size_bits > 0) { __assume(_Size_bits < 16); uint16_t _Val; - if (_Size_bits > 8) { - memcpy(&_Val, _Src, 2); - } else { - _Val = *reinterpret_cast(_Src); - } + memcpy(&_Val, _Src, 2); const __m128i _Elems = _Bitset_to_string_1_step(_Val, _Px0, _Px1); char _Tmp[16]; _mm_storeu_si128(reinterpret_cast<__m128i*>(_Tmp), _Elems); @@ -3780,11 +3776,7 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_2( if (_Size_bits > 0) { __assume(_Size_bits < 16); uint16_t _Val; - if (_Size_bits > 8) { - memcpy(&_Val, _Src, 2); - } else { - _Val = *reinterpret_cast(_Src); - } + memcpy(&_Val, _Src, 2); const __m256i _Elems = _Bitset_to_string_2_step_avx(_Val, _Px0, _Px1); wchar_t _Tmp[16]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems); @@ -3830,5 +3822,263 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_2( } } +} // extern "C" + +namespace { + + namespace __std_bitset_from_string { + +#ifdef _M_ARM64EC + using _Traits_1_avx = void; + using _Traits_1_sse = void; + using _Traits_2_avx = void; + using _Traits_2_sse = void; +#else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv + struct _Traits_avx { + using _Vec = __m256i; + + static __m256i _Load(const void* _Src) noexcept { + return _mm256_loadu_si256(reinterpret_cast(_Src)); + } + + static void _Store(void* _Dest, const __m256i _Val) noexcept { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Val); + } + + static bool _Check(const __m256i _Val, const __m256i _Ex1, const __m256i _Dx0) noexcept { + return _mm256_testc_si256(_Ex1, _mm256_xor_si256(_Val, _Dx0)); + } + }; + + struct _Traits_sse { + using _Vec = __m128i; + + static __m128i _Load(const void* _Src) noexcept { + return _mm_loadu_si128(reinterpret_cast(_Src)); + } + + static void _Store(void* _Dest, const __m128i _Val) noexcept { + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Val); + } + + static bool _Check(const __m128i _Val, const __m128i _Ex1, const __m128i _Dx0) noexcept { + return _mm_testc_si128(_Ex1, _mm_xor_si128(_Val, _Dx0)); + } + }; + + struct _Traits_1_avx : _Traits_avx { + using _Word = uint32_t; + + static __m256i _Set(const char _Val) noexcept { + return _mm256_set1_epi8(_Val); + } + + static uint32_t _To_bits(const __m256i _Ex1) noexcept { + const __m256i _Shuf = _mm256_set_epi8( // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + return _rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 16); + } + + static __m256i _Cmp(const __m256i _Val, const __m256i _Dx1) noexcept { + return _mm256_cmpeq_epi8(_Val, _Dx1); + } + }; + + struct _Traits_1_sse : _Traits_sse { + using _Word = uint16_t; + + static __m128i _Set(const char _Val) noexcept { + return _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128()); + } + + static uint16_t _To_bits(const __m128i _Ex1) noexcept { + const __m128i _Shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); + return static_cast(_mm_movemask_epi8(_Ex2)); + } + + static __m128i _Cmp(const __m128i _Val, const __m128i _Dx1) noexcept { + return _mm_cmpeq_epi8(_Val, _Dx1); + } + }; + + struct _Traits_2_avx : _Traits_avx { + using _Word = uint16_t; + + static __m256i _Set(const wchar_t _Val) noexcept { + return _mm256_set1_epi16(_Val); + } + + static uint16_t _To_bits(const __m256i _Ex1) noexcept { + const __m256i _Shuf = _mm256_set_epi8( // + +0, +2, +4, +6, +8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, // + -1, -1, -1, -1, -1, -1, -1, -1, +0, +2, +4, +6, +8, 10, 12, 14); + + const __m256i _Ex2 = _mm256_shuffle_epi8(_Ex1, _Shuf); + return static_cast(_rotl(static_cast(_mm256_movemask_epi8(_Ex2)), 8)); + } + + static __m256i _Cmp(const __m256i _Val, const __m256i _Dx1) noexcept { + return _mm256_cmpeq_epi16(_Val, _Dx1); + } + }; + + struct _Traits_2_sse : _Traits_sse { + using _Word = uint8_t; + + static __m128i _Set(const wchar_t _Val) noexcept { + return _mm_set1_epi16(_Val); + } + + static uint8_t _To_bits(const __m128i _Ex1) noexcept { + const __m128i _Shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12, 14); + const __m128i _Ex2 = _mm_shuffle_epi8(_Ex1, _Shuf); + return static_cast(_mm_movemask_epi8(_Ex2)); + } + + static __m128i _Cmp(const __m128i _Val, const __m128i _Dx1) noexcept { + return _mm_cmpeq_epi16(_Val, _Dx1); + } + }; + + template + bool _Loop(const _Elem* const _Src, const _Elem* _Src_end, const typename _Traits::_Vec _Dx0, + const typename _Traits::_Vec _Dx1, _OutFn _Out) noexcept { + for (;;) { + typename _Traits::_Vec _Val; + constexpr size_t _Per_vec = sizeof(_Val) / sizeof(_Elem); + + if (const size_t _Left = _Src_end - _Src; _Left >= _Per_vec) { + _Src_end -= _Per_vec; + _Val = _Traits::_Load(_Src_end); + } else if (_Left == 0) { + return true; + } else { + _Src_end = _Src; + _Elem _Tmp[_Per_vec]; + _Traits::_Store(_Tmp, _Dx0); + _Elem* const _Tmpd = _Tmp + (_Per_vec - _Left); + _CSTD memcpy(_Tmpd, _Src_end, _Left * sizeof(_Elem)); + _Val = _Traits::_Load(_Tmp); + } + + const auto _Ex1 = _Traits::_Cmp(_Val, _Dx1); + + if (!_Traits::_Check(_Val, _Ex1, _Dx0)) { + return false; + } + + _Out(_Ex1); + } + } + + template + bool _Impl(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, const size_t _Size_bits, + const size_t _Size_chars, const _Elem _Elem0, const _Elem _Elem1) noexcept { + const auto _Dx0 = _Traits::_Set(_Elem0); + const auto _Dx1 = _Traits::_Set(_Elem1); + + auto _Dst_words = reinterpret_cast<_Traits::_Word*>(_Dest); + void* _Dst_words_end = _Dst_words; + _Advance_bytes(_Dst_words_end, _Size_bytes); + + auto _Out = [&_Dst_words](const _Traits::_Vec _Ex1) { + *_Dst_words = _Traits::_To_bits(_Ex1); + ++_Dst_words; + }; + + const size_t _Size_convert = (_Size_chars <= _Size_bits) ? _Size_chars : _Size_bits; + + // Convert characters to bits + if (!_Loop<_Traits>(_Src, _Src + _Size_convert, _Dx0, _Dx1, _Out)) { + return false; + } + + // Verify remaining characters, if any + if (_Size_convert != _Size_chars + && !_Loop<_Traits>(_Src + _Size_convert, _Src + _Size_chars, _Dx0, _Dx1, [](_Traits::_Vec) {})) { + return false; + } + + // Trim tail (may be padding tail, or too short string, or both) + if (_Dst_words != _Dst_words_end) { + _CSTD memset(_Dst_words, 0, _Byte_length(_Dst_words, _Dst_words_end)); + } + + return true; + } +#endif // !defined(_M_ARM64EC) + + template + bool _Fallback(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, const size_t _Size_bits, + const size_t _Size_chars, const _Elem _Elem0, const _Elem _Elem1) noexcept { + const auto _Dest_bytes = static_cast(_Dest); + size_t _Size_convert = _Size_chars; + + if (_Size_chars > _Size_bits) { + _Size_convert = _Size_bits; + + for (size_t _Ix = _Size_bits; _Ix < _Size_chars; ++_Ix) { + if (const _Elem _Cur = _Src[_Ix]; _Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { + return false; + } + } + } + + _CSTD memset(_Dest, 0, _Size_bytes); + + for (size_t _Ix = 0; _Ix != _Size_convert; ++_Ix) { + const _Elem _Cur = _Src[_Size_convert - _Ix - 1]; + + if (_Cur != _Elem0 && _Cur != _Elem1) [[unlikely]] { + return false; + } + + _Dest_bytes[_Ix >> 3] |= static_cast(_Cur == _Elem1) << (_Ix & 0x7); + } + + return true; + } + + template + bool _Dispatch(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars, + _Elem _Elem0, _Elem _Elem1) noexcept { +#ifndef _M_ARM64EC + if (_Use_avx2() && _Size_bits >= 256) { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + return _Impl<_Avx>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } else if (_Use_sse42()) { + return _Impl<_Sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } else +#endif // !defined(_M_ARM64EC) + { + return _Fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); + } + } + + } // namespace __std_bitset_from_string + +} // unnamed namespace + +extern "C" { + +__declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* _Dest, const char* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, char _Elem0, char _Elem1) noexcept { + using namespace __std_bitset_from_string; + + return _Dispatch<_Traits_1_avx, _Traits_1_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); +} + +__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* _Dest, const wchar_t* _Src, size_t _Size_bytes, + size_t _Size_bits, size_t _Size_chars, wchar_t _Elem0, wchar_t _Elem1) noexcept { + using namespace __std_bitset_from_string; + + return _Dispatch<_Traits_2_avx, _Traits_2_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1); +} + } // extern "C" #endif // defined(_M_IX86) || defined(_M_X64) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 28f5c92d37..8efd56dd02 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -997,6 +998,15 @@ void test_randomized_bitset_base_count(mt19937_64& gen) { test_randomized_bitset_base(make_index_sequence{}, gen); } +template +void assert_throws_inv(F f) { + try { + f(); + assert(false); + } catch (const invalid_argument&) { + } +} + void test_bitset(mt19937_64& gen) { assert(bitset<0>(0x0ULL).to_string() == ""); assert(bitset<0>(0xFEDCBA9876543210ULL).to_string() == ""); @@ -1038,6 +1048,33 @@ void test_bitset(mt19937_64& gen) { assert(bitset<75>(0xFEDCBA9876543210ULL).to_string() == U"000000000001111111011011100101110101001100001110110010101000011001000010000"); // not vectorized + assert(bitset<0>("").to_ullong() == 0); + assert(bitset<0>("1").to_ullong() == 0); + assert_throws_inv([] { (void) bitset<0>("x"); }); + + assert(bitset<45>("101110000000111010001011100101001111111111111").to_ullong() == 0x1701D1729FFFULL); + assert(bitset<45>("110101001100001110110010101000011001000010000").to_ullong() == 0x1A9876543210ULL); + assert(bitset<45>("111").to_ullong() == 0x7); + assert_throws_inv([] { (void) bitset<45>("11x11"); }); + assert_throws_inv([] { (void) bitset<45>("111111111111111111111111111111111111111111111x"); }); + assert_throws_inv([] { (void) bitset<45>("x111111111111111111111111111111111111111111111"); }); + + assert(bitset<64>("xxxxxxxoxxoxxxooxoxxxoxoxooxxooooxxxoxxooxoxoxooooxxooxooooxoooo", string::npos, 'o', 'x') + .to_ullong() + == 0xFEDCBA9876543210ULL); + assert(bitset<64>(L"xxxxxxxoxxoxxxooxoxxxoxoxooxxooooxxxoxxooxoxoxooooxxooxooooxoooo", wstring::npos, L'o', L'x') + .to_ullong() + == 0xFEDCBA9876543210ULL); + +#ifdef __cpp_lib_char8_t + assert(bitset<75>(u8"000000000001111111011011100101110101001100001110110010101000011001000010000").to_ullong() + == 0xFEDCBA9876543210ULL); +#endif // __cpp_lib_char8_t + assert(bitset<75>(u"000000000001111111011011100101110101001100001110110010101000011001000010000").to_ullong() + == 0xFEDCBA9876543210ULL); + assert(bitset<75>(U"000000000001111111011011100101110101001100001110110010101000011001000010000").to_ullong() + == 0xFEDCBA9876543210ULL); // not vectorized + test_randomized_bitset_base_count<512 - 5, 32 + 10>(gen); }