From aea725f8852331d1cf024062fb8c7deed3e1c796 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Wed, 4 Aug 2021 17:57:20 +0200 Subject: [PATCH] Fix SSE2 variant of TextColor::GetColor (#10867) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shortly before adding the SSE2 variant I "improved" it by using `_mm_packs_epi32`, but failed to test it again afterwards. ## PR Checklist * [x] Closes #10866 * [x] I work here ## Validation Steps Performed * `printf "\e[mNORMAL \e[1mBOLD\n"` results in correct bold white glyphs ✔️ --- src/buffer/out/TextColor.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/buffer/out/TextColor.cpp b/src/buffer/out/TextColor.cpp index 5a628384d63..dcd0a8df342 100644 --- a/src/buffer/out/TextColor.cpp +++ b/src/buffer/out/TextColor.cpp @@ -192,21 +192,24 @@ COLORREF TextColor::GetColor(const std::array& colorTable, const unsigned long index; return _BitScanForward(&index, mask) ? til::at(colorTable, static_cast(index) + 8) : defaultColor; // 5. #elif _M_AMD64 - // If you look closely this SSE2 algorithm is the exact same as the AVX one. + // If you look closely this SSE2 algorithm is the same as the AVX one. // The two differences are that we need to: // * do everything twice, because SSE is limited to 128 bits and not 256. // * use _mm_packs_epi32 to merge two 128 bits vectors into one in step 3.5. // _mm_packs_epi32 takes two SSE registers and truncates all 8 DWORDs into 8 WORDs, // the latter of which fits into a single register (which is then used in the identical step 4). + // * since the result are now 8 WORDs, we need to use _mm_movemask_epi8 (there's no 16-bit variant), + // which unlike AVX's step 4 results in in something like 0b0000110000000000. + // --> the index returned by _BitScanForward must be divided by 2. const auto haystack1 = _mm_loadu_si128(reinterpret_cast(colorTable.data() + 0)); const auto haystack2 = _mm_loadu_si128(reinterpret_cast(colorTable.data() + 4)); const auto needle = _mm_set1_epi32(__builtin_bit_cast(int, defaultColor)); const auto result1 = _mm_cmpeq_epi32(haystack1, needle); const auto result2 = _mm_cmpeq_epi32(haystack2, needle); const auto result = _mm_packs_epi32(result1, result2); // 3.5 - const auto mask = _mm_movemask_ps(_mm_castsi128_ps(result)); + const auto mask = _mm_movemask_epi8(result); unsigned long index; - return _BitScanForward(&index, mask) ? til::at(colorTable, static_cast(index) + 8) : defaultColor; + return _BitScanForward(&index, mask) ? til::at(colorTable, static_cast(index / 2) + 8) : defaultColor; #else for (size_t i = 0; i < 8; i++) {