From a9763f7086738c4cb2fdebb6e36d06689d9be65e Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Wed, 14 Jun 2023 01:33:52 +0200 Subject: [PATCH] Vectorize ROW initialization --- .github/actions/spelling/allow/apis.txt | 1 + .github/actions/spelling/expect/expect.txt | 5 +- .../actions/spelling/patterns/patterns.txt | 6 + src/buffer/out/Row.cpp | 113 +++++++++++++++++- src/buffer/out/Row.hpp | 11 +- src/inc/til.h | 9 ++ 6 files changed, 136 insertions(+), 9 deletions(-) diff --git a/.github/actions/spelling/allow/apis.txt b/.github/actions/spelling/allow/apis.txt index cfd2b503c49..8665cd35831 100644 --- a/.github/actions/spelling/allow/apis.txt +++ b/.github/actions/spelling/allow/apis.txt @@ -87,6 +87,7 @@ IObject iosfwd IPackage IPeasant +isa ISetup isspace IStorage diff --git a/.github/actions/spelling/expect/expect.txt b/.github/actions/spelling/expect/expect.txt index f6cea6a33d9..7e7b46f7d3a 100644 --- a/.github/actions/spelling/expect/expect.txt +++ b/.github/actions/spelling/expect/expect.txt @@ -1,4 +1,5 @@ aabbcc +aarch ABANDONFONT abbcc ABCDEFGHIJKLMNOPQRSTUVWXY @@ -157,7 +158,6 @@ capslock CARETBLINKINGENABLED CARRIAGERETURN cascadia -castsi catid cazamor CBash @@ -216,7 +216,6 @@ cmder CMDEXT cmh CMOUSEBUTTONS -cmpeq cmt cmw cmyk @@ -1024,7 +1023,6 @@ lnkd lnkfile LNM LOADONCALL -loadu LOBYTE localappdata locsrc @@ -1155,7 +1153,6 @@ MOUSEACTIVATE MOUSEFIRST MOUSEHWHEEL MOUSEMOVE -movemask MOVESTART msb msctf diff --git a/.github/actions/spelling/patterns/patterns.txt b/.github/actions/spelling/patterns/patterns.txt index 5acf5e9bfa6..6e4a39485df 100644 --- a/.github/actions/spelling/patterns/patterns.txt +++ b/.github/actions/spelling/patterns/patterns.txt @@ -27,6 +27,12 @@ ROY\sG\.\sBIV # Python stringprefix / binaryprefix \b(?:B|BR|Br|F|FR|Fr|R|RB|RF|Rb|Rf|U|UR|Ur|b|bR|br|f|fR|fr|r|rB|rF|rb|rf|u|uR|ur)' +# SSE intrinsics like "_mm_subs_epu16" +\b_mm(?:|256|512)_\w+\b + +# ARM NEON intrinsics like "vsubq_u16" +\bv\w+_[fsu](?:8|16|32|64)\b + # 
Automatically suggested patterns # hit-count: 3831 file-count: 582 # IServiceProvider diff --git a/src/buffer/out/Row.cpp b/src/buffer/out/Row.cpp index 737c59d4107..b262f19dfa9 100644 --- a/src/buffer/out/Row.cpp +++ b/src/buffer/out/Row.cpp @@ -9,6 +9,8 @@ #include "textBuffer.hpp" #include "../../types/inc/GlyphWidth.hpp" +extern "C" int __isa_available; + // The STL is missing a std::iota_n analogue for std::iota, so I made my own. template constexpr OutIt iota_n(OutIt dest, Diff count, T val) @@ -134,8 +136,117 @@ void ROW::Reset(const TextAttribute& attr) void ROW::_init() noexcept { - std::fill_n(_chars.begin(), _columnCount, UNICODE_SPACE); +#pragma warning(push) +#pragma warning(disable : 26462) // The value pointed to by '...' is assigned only once, mark it as a pointer to const (con.4). +#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1). +#pragma warning(disable : 26490) // Don't use reinterpret_cast (type.1). + + // Fills _charsBuffer with whitespace and correspondingly _charOffsets + // with successive numbers from 0 to _columnCount+1. +#if defined(TIL_SSE_INTRINSICS) + alignas(__m256i) static constexpr uint16_t whitespaceData[]{ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }; + alignas(__m256i) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + alignas(__m256i) static constexpr uint16_t increment16Data[]{ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; + alignas(__m128i) static constexpr uint16_t increment8Data[]{ 8, 8, 8, 8, 8, 8, 8, 8 }; + + // The AVX loop operates on 32 bytes at a minimum. Since _charsBuffer/_charOffsets uses 2 byte large + // wchar_t/uint16_t respectively, this translates to 16-element writes, which equals a _columnCount of 15, + // because it doesn't include the past-the-end char-offset as described in the _charOffsets member comment. 
+ if (__isa_available >= __ISA_AVAILABLE_AVX2 && _columnCount >= 15) + { + auto chars = _charsBuffer; + auto charOffsets = _charOffsets.data(); + + // The backing buffer for both chars and charOffsets is guaranteed to be 16-byte aligned, + // but AVX operations are 32-byte large. As such, when we write out the last chunk, we + // have to align it to the ends of the 2 buffers. This results in a potential overlap of + // 16 bytes between the last write in the main loop below and the final write afterwards. + // + // An example: + // If you have a terminal between 16 and 23 columns the buffer has a size of 48 bytes. + // The main loop below will iterate once, as it writes out bytes 0-31 and then exits. + // The final write afterwards cannot write bytes 32-63 because that would write + // out of bounds. Instead it writes bytes 16-47, overwriting 16 overlapping bytes. + // This is better than branching and switching to SSE2, because both things are slow. + // + // Since we want to exit the main loop with at least 1 write left to do as the final write, + // we need to subtract 1 alignment from the buffer length (= 16 bytes). Since _columnCount is + // in wchar_t's we subtract -8. The same applies to the ~7 here vs ~15. If you squint slightly + // you'll see how this is effectively the inverse of what CalculateCharsBufferStride does. 
+ const auto tailColumnOffset = gsl::narrow_cast((_columnCount - 8u) & ~7); + const auto charsEndLoop = chars + tailColumnOffset; + const auto charOffsetsEndLoop = charOffsets + tailColumnOffset; + + const auto whitespace = _mm256_load_si256(reinterpret_cast(&whitespaceData[0])); + auto offsetsLoop = _mm256_load_si256(reinterpret_cast(&offsetsData[0])); + const auto offsets = _mm256_add_epi16(offsetsLoop, _mm256_set1_epi16(tailColumnOffset)); + + if (chars < charsEndLoop) + { + const auto increment = _mm256_load_si256(reinterpret_cast(&increment16Data[0])); + + do + { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(chars), whitespace); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsets), offsetsLoop); + offsetsLoop = _mm256_add_epi16(offsetsLoop, increment); + chars += 16; + charOffsets += 16; + } while (chars < charsEndLoop); + } + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(charsEndLoop), whitespace); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsetsEndLoop), offsets); + } + else + { + auto chars = _charsBuffer; + auto charOffsets = _charOffsets.data(); + const auto charsEnd = chars + _columnCount; + + const auto whitespace = _mm_load_si128(reinterpret_cast(&whitespaceData[0])); + const auto increment = _mm_load_si128(reinterpret_cast(&increment8Data[0])); + auto offsets = _mm_load_si128(reinterpret_cast(&offsetsData[0])); + + do + { + _mm_storeu_si128(reinterpret_cast<__m128i*>(chars), whitespace); + _mm_storeu_si128(reinterpret_cast<__m128i*>(charOffsets), offsets); + offsets = _mm_add_epi16(offsets, increment); + chars += 8; + charOffsets += 8; + // If _columnCount is something like 120, the actual backing buffer for charOffsets is 121 items large. + // --> The while loop uses <= to emit at least 1 more write. 
+        } while (chars <= charsEnd);
+    }
+#elif defined(TIL_ARM_NEON_INTRINSICS)
+    alignas(uint16x8_t) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7 };
+
+    auto chars = _charsBuffer;
+    auto charOffsets = _charOffsets.data();
+    const auto charsEnd = chars + _columnCount;
+
+    const auto whitespace = vdupq_n_u16(L' ');
+    const auto increment = vdupq_n_u16(8);
+    auto offsets = vld1q_u16(&offsetsData[0]);
+
+    do
+    {
+        vst1q_u16(chars, whitespace);
+        vst1q_u16(charOffsets, offsets);
+        offsets = vaddq_u16(offsets, increment);
+        chars += 8;
+        charOffsets += 8;
+        // If _columnCount is something like 120, the actual backing buffer for charOffsets is 121 items large.
+        // --> The while loop uses <= to emit at least 1 more write.
+    } while (chars <= charsEnd);
+#else
+#error "Vectorizing this function improves overall performance by up to 40%. Don't remove this warning, just add the vectorized code."
+    std::fill_n(_charsBuffer, _columnCount, UNICODE_SPACE);
     std::iota(_charOffsets.begin(), _charOffsets.end(), uint16_t{ 0 });
+#endif
+
+#pragma warning(pop)
 }
 
 void ROW::TransferAttributes(const til::small_rle& attr, til::CoordType newWidth)
diff --git a/src/buffer/out/Row.hpp b/src/buffer/out/Row.hpp
index b677f565d36..79d61568a1c 100644
--- a/src/buffer/out/Row.hpp
+++ b/src/buffer/out/Row.hpp
@@ -60,16 +60,19 @@ struct RowWriteState
 class ROW final
 {
 public:
-    // The implicit agreement between ROW and TextBuffer is that TextBuffer supplies ROW with a charsBuffer of at
-    // least `columns * sizeof(wchar_t)` bytes and a charOffsetsBuffer of at least `(columns + 1) * sizeof(uint16_t)`
-    // bytes (see ROW::_charOffsets for why it needs space for 1 additional offset).
+    // The implicit agreement between ROW and TextBuffer is that the `charsBuffer` and `charOffsetsBuffer`
+    // arrays have a minimum alignment of 16 Bytes and a size of `rowWidth+1`. 
The former is used to + // implement Reset() efficiently via SIMD and the latter is used to store the past-the-end offset + // into the `charsBuffer`. Even though the `charsBuffer` could be only `rowWidth` large we need them + // to be the same size so that the SIMD code can process both arrays in the same loop simultaneously. + // This wastes up to 5.8% memory but increases overall scrolling performance by around 40%. // These methods exists to make this agreement explicit and serve as a reminder. // // TextBuffer calculates the distance in bytes between two ROWs (_bufferRowStride) as the sum of these values. // As such it's important that we return sizes with a minimum alignment of alignof(ROW). static constexpr size_t CalculateRowSize() noexcept { - return sizeof(ROW); + return (sizeof(ROW) + 15) & ~15; } static constexpr size_t CalculateCharsBufferSize(size_t columns) noexcept { diff --git a/src/inc/til.h b/src/inc/til.h index f7bcc5ac119..ee4f29df34c 100644 --- a/src/inc/til.h +++ b/src/inc/til.h @@ -3,6 +3,15 @@ #pragma once +// This is a copy of how DirectXMath.h determines _XM_SSE_INTRINSICS_ and _XM_ARM_NEON_INTRINSICS_. +#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) +#define TIL_SSE_INTRINSICS +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__ +#define TIL_ARM_NEON_INTRINSICS +#else +#define TIL_NO_INTRINSICS +#endif + #define _TIL_INLINEPREFIX __declspec(noinline) inline #include "til/at.h"