Skip to content

Commit

Permalink
Fix SSE2 variant of TextColor::GetColor
Browse files Browse the repository at this point in the history
  • Loading branch information
lhecker committed Aug 4, 2021
1 parent 8ab3422 commit 34f0a43
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions src/buffer/out/TextColor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,21 +192,24 @@ COLORREF TextColor::GetColor(const std::array<COLORREF, 256>& colorTable, const
unsigned long index;
return _BitScanForward(&index, mask) ? til::at(colorTable, static_cast<size_t>(index) + 8) : defaultColor; // 5.
#elif _M_AMD64
// If you look closely this SSE2 algorithm is the exact same as the AVX one.
// If you look closely this SSE2 algorithm is the same as the AVX one.
// The two differences are that we need to:
// * do everything twice, because SSE is limited to 128 bits and not 256.
// * use _mm_packs_epi32 to merge two 128 bits vectors into one in step 3.5.
// _mm_packs_epi32 takes two SSE registers and truncates all 8 DWORDs into 8 WORDs,
// the latter of which fits into a single register (which is then used in the identical step 4).
// * since the result is are now 8 WORDs, we need to use _mm_movemask_epi8,
// which unlike 4. for AVX results in in something like 0b0000110000000000.
// --> index must be divided by 2.
const auto haystack1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(colorTable.data() + 0));
const auto haystack2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(colorTable.data() + 4));
const auto needle = _mm_set1_epi32(__builtin_bit_cast(int, defaultColor));
const auto result1 = _mm_cmpeq_epi32(haystack1, needle);
const auto result2 = _mm_cmpeq_epi32(haystack2, needle);
const auto result = _mm_packs_epi32(result1, result2); // 3.5
const auto mask = _mm_movemask_ps(_mm_castsi128_ps(result));
const auto mask = _mm_movemask_epi8(result);
unsigned long index;
return _BitScanForward(&index, mask) ? til::at(colorTable, static_cast<size_t>(index) + 8) : defaultColor;
return _BitScanForward(&index, mask) ? til::at(colorTable, static_cast<size_t>(index / 2) + 8) : defaultColor;
#else
for (size_t i = 0; i < 8; i++)
{
Expand Down

0 comments on commit 34f0a43

Please sign in to comment.