diff --git a/.github/actions/spelling/expect/expect.txt b/.github/actions/spelling/expect/expect.txt
index 7e7b46f7d3a..74dc2109b20 100644
--- a/.github/actions/spelling/expect/expect.txt
+++ b/.github/actions/spelling/expect/expect.txt
@@ -2062,6 +2062,7 @@ vcpkg
 vcprintf
 vcxitems
 vec
+vectorize
 vectorized
 VERCTRL
 VERTBAR
diff --git a/src/buffer/out/Row.cpp b/src/buffer/out/Row.cpp
index 097fc10dc11..0ea79d6649b 100644
--- a/src/buffer/out/Row.cpp
+++ b/src/buffer/out/Row.cpp
@@ -631,26 +631,36 @@ catch (...)
     const auto baseOffset = til::at(charOffsets, 0);
     const auto endOffset = til::at(charOffsets, colEndInput);
     const auto inToOutOffset = gsl::narrow_cast<uint16_t>(chBeg - baseOffset);
+#pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
+    const auto dst = row._charOffsets.data() + colEnd;
 
-    // Now with the `colEndInput` figured out, we can easily copy the `charOffsets` into the `_charOffsets`.
-    // It's possible to use SIMD for this loop for extra perf gains. Something like this for SSE2 (~8x faster):
-    //   const auto in = _mm_loadu_si128(...);
-    //   const auto off = _mm_and_epi32(in, _mm_set1_epi16(CharOffsetsMask));
-    //   const auto trailer = _mm_and_epi32(in, _mm_set1_epi16(CharOffsetsTrailer));
-    //   const auto out = _mm_or_epi32(_mm_add_epi16(off, _mm_set1_epi16(inToOutOffset)), trailer);
-    //   _mm_store_si128(..., out);
-    for (uint16_t i = 0; i < colEndInput; ++i, ++colEnd)
-    {
-        const auto ch = til::at(charOffsets, i);
-        const auto off = ch & CharOffsetsMask;
-        const auto trailer = ch & CharOffsetsTrailer;
-        til::at(row._charOffsets, colEnd) = gsl::narrow_cast<uint16_t>((off + inToOutOffset) | trailer);
-    }
+    _copyOffsets(dst, charOffsets.data(), colEndInput, inToOutOffset);
+    colEnd += colEndInput;
 
     colEndDirty = gsl::narrow_cast<uint16_t>(colBeg + colEndDirtyInput);
     charsConsumed = endOffset - baseOffset;
 }
 
+#pragma warning(push)
+#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
+[[msvc::forceinline]] void ROW::WriteHelper::_copyOffsets(uint16_t* __restrict dst, const uint16_t* __restrict src, uint16_t size, uint16_t offset) noexcept
+{
+    __assume(src != nullptr);
+    __assume(dst != nullptr);
+
+    // All tested compilers (including MSVC) will neatly unroll and vectorize
+    // this loop, which is why it's written in this particular way.
+    for (const auto end = src + size; src != end; ++src, ++dst)
+    {
+        const uint16_t ch = *src;
+        const uint16_t off = ch & CharOffsetsMask;
+        const uint16_t trailer = ch & CharOffsetsTrailer;
+        const uint16_t newOff = off + offset;
+        *dst = newOff | trailer;
+    }
+}
+#pragma warning(pop)
+
 [[msvc::forceinline]] void ROW::WriteHelper::Finish()
 {
     colEndDirty = row._adjustForward(colEndDirty);
diff --git a/src/buffer/out/Row.hpp b/src/buffer/out/Row.hpp
index 2054086f95a..6e4d8f0062e 100644
--- a/src/buffer/out/Row.hpp
+++ b/src/buffer/out/Row.hpp
@@ -171,6 +171,7 @@ class ROW final
         void ReplaceCharacters(til::CoordType width) noexcept;
         void ReplaceText() noexcept;
         void CopyTextFrom(const std::span<const uint16_t>& charOffsets) noexcept;
+        static void _copyOffsets(uint16_t* dst, const uint16_t* src, uint16_t size, uint16_t offset) noexcept;
        void Finish();
 
         // Parent pointer.
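
For reference, here is a minimal standalone sketch of the rebase-and-preserve-trailer transform that the new _copyOffsets helper performs. It assumes CharOffsetsTrailer is the top bit of the uint16_t (0x8000) and CharOffsetsMask covers the remaining bits (0x7fff); the actual constants live in Row.hpp, so treat these values and the sample data as assumptions for illustration only.

// Sketch of the offset-copy transform introduced in this diff.
// Assumption: CharOffsetsTrailer marks trailer columns via the top bit and
// CharOffsetsMask covers the remaining offset bits; see Row.hpp for the
// authoritative definitions.
#include <cstdint>
#include <cstdio>

constexpr uint16_t CharOffsetsTrailer = 0x8000;
constexpr uint16_t CharOffsetsMask = 0x7fff;

// Same shape as ROW::WriteHelper::_copyOffsets: rebase each offset by `offset`
// while preserving the trailer flag. It is written as a plain scalar loop so
// that the compiler can unroll and auto-vectorize it.
static void copyOffsets(uint16_t* dst, const uint16_t* src, uint16_t size, uint16_t offset) noexcept
{
    for (const auto end = src + size; src != end; ++src, ++dst)
    {
        const uint16_t ch = *src;
        const uint16_t off = ch & CharOffsetsMask;
        const uint16_t trailer = ch & CharOffsetsTrailer;
        *dst = static_cast<uint16_t>((off + offset) | trailer);
    }
}

int main()
{
    // A wide glyph starting at input offset 2 occupies two columns; the second
    // column carries the trailer flag and repeats the same offset.
    const uint16_t in[4]{ 0, 1, 2, static_cast<uint16_t>(2 | CharOffsetsTrailer) };
    uint16_t out[4]{};
    copyOffsets(out, in, 4, 10); // rebase by 10, e.g. when writing mid-row

    for (const auto v : out)
    {
        std::printf("%d (trailer=%d)\n", v & CharOffsetsMask, (v & CharOffsetsTrailer) != 0);
    }
    // Prints 10, 11, 12, 12, with the last entry flagged as a trailer.
}

Building this with optimizations enabled should show the loop being auto-vectorized, which is the property the in-code comment above relies on in place of the old hand-written SSE2 suggestion.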