From a9763f7086738c4cb2fdebb6e36d06689d9be65e Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Wed, 14 Jun 2023 01:33:52 +0200 Subject: [PATCH] Vectorize ROW initialization --- .github/actions/spelling/allow/apis.txt | 1 + .github/actions/spelling/expect/expect.txt | 5 +- .../actions/spelling/patterns/patterns.txt | 6 + src/buffer/out/Row.cpp | 113 +++++++++++++++++- src/buffer/out/Row.hpp | 11 +- src/inc/til.h | 9 ++ 6 files changed, 136 insertions(+), 9 deletions(-) diff --git a/.github/actions/spelling/allow/apis.txt b/.github/actions/spelling/allow/apis.txt index cfd2b503c49..8665cd35831 100644 --- a/.github/actions/spelling/allow/apis.txt +++ b/.github/actions/spelling/allow/apis.txt @@ -87,6 +87,7 @@ IObject iosfwd IPackage IPeasant +isa ISetup isspace IStorage diff --git a/.github/actions/spelling/expect/expect.txt b/.github/actions/spelling/expect/expect.txt index f6cea6a33d9..7e7b46f7d3a 100644 --- a/.github/actions/spelling/expect/expect.txt +++ b/.github/actions/spelling/expect/expect.txt @@ -1,4 +1,5 @@ aabbcc +aarch ABANDONFONT abbcc ABCDEFGHIJKLMNOPQRSTUVWXY @@ -157,7 +158,6 @@ capslock CARETBLINKINGENABLED CARRIAGERETURN cascadia -castsi catid cazamor CBash @@ -216,7 +216,6 @@ cmder CMDEXT cmh CMOUSEBUTTONS -cmpeq cmt cmw cmyk @@ -1024,7 +1023,6 @@ lnkd lnkfile LNM LOADONCALL -loadu LOBYTE localappdata locsrc @@ -1155,7 +1153,6 @@ MOUSEACTIVATE MOUSEFIRST MOUSEHWHEEL MOUSEMOVE -movemask MOVESTART msb msctf diff --git a/.github/actions/spelling/patterns/patterns.txt b/.github/actions/spelling/patterns/patterns.txt index 5acf5e9bfa6..6e4a39485df 100644 --- a/.github/actions/spelling/patterns/patterns.txt +++ b/.github/actions/spelling/patterns/patterns.txt @@ -27,6 +27,12 @@ ROY\sG\.\sBIV # Python stringprefix / binaryprefix \b(?:B|BR|Br|F|FR|Fr|R|RB|RF|Rb|Rf|U|UR|Ur|b|bR|br|f|fR|fr|r|rB|rF|rb|rf|u|uR|ur)' +# SSE intrinsics like "_mm_subs_epu16" +\b_mm(?:|256|512)_\w+\b + +# ARM NEON intrinsics like "vsubq_u16" +\bv\w+_[fsu](?:8|16|32|64)\b + # 
Automatically suggested patterns # hit-count: 3831 file-count: 582 # IServiceProvider diff --git a/src/buffer/out/Row.cpp b/src/buffer/out/Row.cpp index 737c59d4107..b262f19dfa9 100644 --- a/src/buffer/out/Row.cpp +++ b/src/buffer/out/Row.cpp @@ -9,6 +9,8 @@ #include "textBuffer.hpp" #include "../../types/inc/GlyphWidth.hpp" +extern "C" int __isa_available; + // The STL is missing a std::iota_n analogue for std::iota, so I made my own. template constexpr OutIt iota_n(OutIt dest, Diff count, T val) @@ -134,8 +136,117 @@ void ROW::Reset(const TextAttribute& attr) void ROW::_init() noexcept { - std::fill_n(_chars.begin(), _columnCount, UNICODE_SPACE); +#pragma warning(push) +#pragma warning(disable : 26462) // The value pointed to by '...' is assigned only once, mark it as a pointer to const (con.4). +#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1). +#pragma warning(disable : 26490) // Don't use reinterpret_cast (type.1). + + // Fills _charsBuffer with whitespace and correspondingly _charOffsets + // with successive numbers from 0 to _columnCount+1. +#if defined(TIL_SSE_INTRINSICS) + alignas(__m256i) static constexpr uint16_t whitespaceData[]{ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }; + alignas(__m256i) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + alignas(__m256i) static constexpr uint16_t increment16Data[]{ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; + alignas(__m128i) static constexpr uint16_t increment8Data[]{ 8, 8, 8, 8, 8, 8, 8, 8 }; + + // The AVX loop operates on 32 bytes at a minimum. Since _charsBuffer/_charOffsets uses 2 byte large + // wchar_t/uint16_t respectively, this translates to 16-element writes, which equals a _columnCount of 15, + // because it doesn't include the past-the-end char-offset as described in the _charOffsets member comment. 
+ if (__isa_available >= __ISA_AVAILABLE_AVX2 && _columnCount >= 15) + { + auto chars = _charsBuffer; + auto charOffsets = _charOffsets.data(); + + // The backing buffer for both chars and charOffsets is guaranteed to be 16-byte aligned, + // but AVX operations are 32-byte large. As such, when we write out the last chunk, we + // have to align it to the ends of the 2 buffers. This results in a potential overlap of + // 16 bytes between the last write in the main loop below and the final write afterwards. + // + // An example: + // If you have a terminal between 16 and 23 columns the buffer has a size of 48 bytes. + // The main loop below will iterate once, as it writes out bytes 0-31 and then exits. + // The final write afterwards cannot write bytes 32-63 because that would write + // out of bounds. Instead it writes bytes 16-47, overwriting 16 overlapping bytes. + // This is better than branching and switching to SSE2, because both things are slow. + // + // Since we want to exit the main loop with at least 1 write left to do as the final write, + // we need to subtract 1 alignment from the buffer length (= 16 bytes). Since _columnCount is + // in wchar_t's we subtract -8. The same applies to the ~7 here vs ~15. If you squint slightly + // you'll see how this is effectively the inverse of what CalculateCharsBufferStride does. 
+ const auto tailColumnOffset = gsl::narrow_cast((_columnCount - 8u) & ~7); + const auto charsEndLoop = chars + tailColumnOffset; + const auto charOffsetsEndLoop = charOffsets + tailColumnOffset; + + const auto whitespace = _mm256_load_si256(reinterpret_cast(&whitespaceData[0])); + auto offsetsLoop = _mm256_load_si256(reinterpret_cast(&offsetsData[0])); + const auto offsets = _mm256_add_epi16(offsetsLoop, _mm256_set1_epi16(tailColumnOffset)); + + if (chars < charsEndLoop) + { + const auto increment = _mm256_load_si256(reinterpret_cast(&increment16Data[0])); + + do + { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(chars), whitespace); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsets), offsetsLoop); + offsetsLoop = _mm256_add_epi16(offsetsLoop, increment); + chars += 16; + charOffsets += 16; + } while (chars < charsEndLoop); + } + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(charsEndLoop), whitespace); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsetsEndLoop), offsets); + } + else + { + auto chars = _charsBuffer; + auto charOffsets = _charOffsets.data(); + const auto charsEnd = chars + _columnCount; + + const auto whitespace = _mm_load_si128(reinterpret_cast(&whitespaceData[0])); + const auto increment = _mm_load_si128(reinterpret_cast(&increment8Data[0])); + auto offsets = _mm_load_si128(reinterpret_cast(&offsetsData[0])); + + do + { + _mm_storeu_si128(reinterpret_cast<__m128i*>(chars), whitespace); + _mm_storeu_si128(reinterpret_cast<__m128i*>(charOffsets), offsets); + offsets = _mm_add_epi16(offsets, increment); + chars += 8; + charOffsets += 8; + // If _columnCount is something like 120, the actual backing buffer for charOffsets is 121 items large. + // --> The while loop uses <= to emit at least 1 more write. 
+        } while (chars <= charsEnd);
+    }
+#elif defined(TIL_ARM_NEON_INTRINSICS)
+    alignas(uint16x8_t) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7 };
+
+    auto chars = _charsBuffer;
+    auto charOffsets = _charOffsets.data();
+    const auto charsEnd = chars + _columnCount;
+
+    const auto whitespace = vdupq_n_u16(L' ');
+    const auto increment = vdupq_n_u16(8);
+    auto offsets = vld1q_u16(&offsetsData[0]);
+
+    do
+    {
+        vst1q_u16(chars, whitespace);
+        vst1q_u16(charOffsets, offsets);
+        offsets = vaddq_u16(offsets, increment);
+        chars += 8;
+        charOffsets += 8;
+        // If _columnCount is something like 120, the actual backing buffer for charOffsets is 121 items large.
+        // --> The while loop uses <= to emit at least 1 more write.
+    } while (chars <= charsEnd);
+#else
+#error "Vectorizing this function improves overall performance by up to 40%. Don't remove this warning, just add the vectorized code."
+    std::fill_n(_charsBuffer, _columnCount, UNICODE_SPACE);
     std::iota(_charOffsets.begin(), _charOffsets.end(), uint16_t{ 0 });
+#endif
+
+#pragma warning(pop)
 }
 
 void ROW::TransferAttributes(const til::small_rle& attr, til::CoordType newWidth)
diff --git a/src/buffer/out/Row.hpp b/src/buffer/out/Row.hpp
index b677f565d36..79d61568a1c 100644
--- a/src/buffer/out/Row.hpp
+++ b/src/buffer/out/Row.hpp
@@ -60,16 +60,19 @@ struct RowWriteState
 class ROW final
 {
 public:
-    // The implicit agreement between ROW and TextBuffer is that TextBuffer supplies ROW with a charsBuffer of at
-    // least `columns * sizeof(wchar_t)` bytes and a charOffsetsBuffer of at least `(columns + 1) * sizeof(uint16_t)`
-    // bytes (see ROW::_charOffsets for why it needs space for 1 additional offset).
+    // The implicit agreement between ROW and TextBuffer is that the `charsBuffer` and `charOffsetsBuffer`
+    // arrays have a minimum alignment of 16 Bytes and a size of `rowWidth+1`. 
The former is used to + // implement Reset() efficiently via SIMD and the latter is used to store the past-the-end offset + // into the `charsBuffer`. Even though the `charsBuffer` could be only `rowWidth` large we need them + // to be the same size so that the SIMD code can process both arrays in the same loop simultaneously. + // This wastes up to 5.8% memory but increases overall scrolling performance by around 40%. // These methods exists to make this agreement explicit and serve as a reminder. // // TextBuffer calculates the distance in bytes between two ROWs (_bufferRowStride) as the sum of these values. // As such it's important that we return sizes with a minimum alignment of alignof(ROW). static constexpr size_t CalculateRowSize() noexcept { - return sizeof(ROW); + return (sizeof(ROW) + 15) & ~15; } static constexpr size_t CalculateCharsBufferSize(size_t columns) noexcept { diff --git a/src/inc/til.h b/src/inc/til.h index f7bcc5ac119..ee4f29df34c 100644 --- a/src/inc/til.h +++ b/src/inc/til.h @@ -3,6 +3,15 @@ #pragma once +// This is a copy of how DirectXMath.h determines _XM_SSE_INTRINSICS_ and _XM_ARM_NEON_INTRINSICS_. +#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) +#define TIL_SSE_INTRINSICS +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__ +#define TIL_ARM_NEON_INTRINSICS +#else +#define TIL_NO_INTRINSICS +#endif + #define _TIL_INLINEPREFIX __declspec(noinline) inline #include "til/at.h"