Skip to content

Commit

Permalink
Vectorize ROW initialization
Browse files Browse the repository at this point in the history
  • Loading branch information
lhecker committed Jun 5, 2023
1 parent 3cb78a4 commit a919562
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 14 deletions.
94 changes: 87 additions & 7 deletions src/buffer/out/Row.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include "textBuffer.hpp"
#include "../../types/inc/GlyphWidth.hpp"

extern "C" long __isa_enabled;

// The STL is missing a std::iota_n analogue for std::iota, so I made my own.
template<typename OutIt, typename Diff, typename T>
constexpr OutIt iota_n(OutIt dest, Diff count, T val)
Expand Down Expand Up @@ -82,10 +84,7 @@ ROW::ROW(wchar_t* charsBuffer, uint16_t* charOffsetsBuffer, uint16_t rowWidth, c
_attr{ rowWidth, fillAttribute },
_columnCount{ rowWidth }
{
if (_chars.data())
{
_init();
}
_init();
}

void ROW::SetWrapForced(const bool wrap) noexcept
Expand Down Expand Up @@ -124,7 +123,7 @@ LineRendition ROW::GetLineRendition() const noexcept
// - Attr - The default attribute (color) to fill
// Return Value:
// - <none>
void ROW::Reset(const TextAttribute& attr)
void ROW::Reset(const TextAttribute& attr) noexcept
{
_charsHeap.reset();
_chars = { _charsBuffer, _columnCount };
Expand All @@ -139,8 +138,89 @@ void ROW::Reset(const TextAttribute& attr)

void ROW::_init() noexcept
{
std::fill_n(_chars.begin(), _columnCount, UNICODE_SPACE);
std::iota(_charOffsets.begin(), _charOffsets.end(), uint16_t{ 0 });
#pragma warning(push)
#pragma warning(disable : 26462) // The value pointed to by '...' is assigned only once, mark it as a pointer to const (con.4).
#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
#pragma warning(disable : 26490) // Don't use reinterpret_cast (type.1).

// Fills _charsBuffer with whitespace and correspondingly _charOffsets
// with successive numbers from 0 to _columnCount+1.
#if defined(TIL_SSE_INTRINSICS)
alignas(__m256i) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

if ((__isa_enabled & (1 << __ISA_AVAILABLE_AVX2)) && _columnCount >= 16)
{
auto chars = _charsBuffer;
auto charOffsets = _charOffsets.data();

const auto tailColumnOffset = gsl::narrow_cast<uint16_t>((_columnCount - 8u) & ~7);
const auto charsEndLoop = chars + tailColumnOffset;
const auto charOffsetsEndLoop = charOffsets + tailColumnOffset;

const auto whitespace = _mm256_set1_epi16(L' ');
auto offsetsLoop = _mm256_load_si256(reinterpret_cast<const __m256i*>(&offsetsData[0]));
const auto offsets = _mm256_add_epi16(offsetsLoop, _mm256_set1_epi16(tailColumnOffset));

if (chars < charsEndLoop)
{
const auto increment = _mm256_set1_epi16(16);

do
{
_mm256_storeu_si256(reinterpret_cast<__m256i*>(chars), whitespace);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsets), offsetsLoop);
offsetsLoop = _mm256_add_epi16(offsetsLoop, increment);
chars += 16;
charOffsets += 16;
} while (chars < charsEndLoop);
}

_mm256_storeu_si256(reinterpret_cast<__m256i*>(charsEndLoop), whitespace);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(charOffsetsEndLoop), offsets);
}
else
{
auto chars = _charsBuffer;
auto charOffsets = _charOffsets.data();
const auto charsEnd = chars + _columnCount;

const auto whitespace = _mm_set1_epi16(L' ');
const auto increment = _mm_set1_epi16(8);
auto offsets = _mm_load_si128(reinterpret_cast<const __m128i*>(&offsetsData[0]));

do
{
_mm_storeu_si128(reinterpret_cast<__m128i*>(chars), whitespace);
_mm_storeu_si128(reinterpret_cast<__m128i*>(charOffsets), offsets);
offsets = _mm_add_epi16(offsets, increment);
chars += 8;
charOffsets += 8;
} while (chars <= charsEnd);
}
#elif defined(TIL_ARM_NEON_INTRINSICS)
alignas(uint16x8_t) static constexpr uint16_t offsetsData[]{ 0, 1, 2, 3, 4, 5, 6, 7 };

auto chars = _charsBuffer;
auto charOffsets = _charOffsets.data();
const auto charsEnd = chars + _columnCount;

const auto whitespace = vdupq_n_u16(L' ');
const auto increment = vdupq_n_u16(8);
auto offsets = vld1q_u16(&offsetsData[0]);

do
{
vst1q_u16(chars, whitespace);
vst1q_u16(charOffsets, offsets);
offsets = vaddq_u16(offsets, increment);
chars += 8;
charOffsets += 8;
} while (chars <= charsEnd);
#else
#error "This method benefits greatly from vectorization, which makes text processing significantly faster overall (+40% with AVX2). If you intentionally break my performance I'll break you. :)"
#endif

#pragma warning(push)
}

void ROW::TransferAttributes(const til::small_rle<TextAttribute, uint16_t, 1>& attr, til::CoordType newWidth)
Expand Down
19 changes: 18 additions & 1 deletion src/buffer/out/Row.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,23 @@ struct RowWriteState
class ROW final
{
public:
// The implicit agreement between ROW and TextBuffer is that the `charsBuffer` and `charOffsetsBuffer`
// arrays have a minimum alignment of 16 Bytes and a size of `rowWidth+1`. The former is used to
// implement Reset() efficiently via SIMD and the latter is used to store the past-the-end offset
// into the `charsBuffer`. Even though the `charsBuffer` could be only `rowWidth` large we need them
// to be the same size so that the SIMD code can process both arrays in the same loop simultaneously.
// This wastes up to 5.8% memory but increases overall scrolling performance by around 40%.
//
// This method exists to make this agreement explicit and serve as a reminder.
static constexpr size_t CalculateCharsBufferStride(size_t columns) noexcept
{
return (columns * sizeof(wchar_t) + 16) & ~15;
}
static constexpr size_t CalculateCharOffsetsBufferStride(size_t columns) noexcept
{
return (columns * sizeof(uint16_t) + 16) & ~15;
}

ROW() = default;
ROW(wchar_t* charsBuffer, uint16_t* charOffsetsBuffer, uint16_t rowWidth, const TextAttribute& fillAttribute);

Expand All @@ -76,7 +93,7 @@ class ROW final
void SetLineRendition(const LineRendition lineRendition) noexcept;
LineRendition GetLineRendition() const noexcept;

void Reset(const TextAttribute& attr);
void Reset(const TextAttribute& attr) noexcept;
void TransferAttributes(const til::small_rle<TextAttribute, uint16_t, 1>& attr, til::CoordType newWidth);

til::CoordType NavigateToPrevious(til::CoordType column) const noexcept;
Expand Down
8 changes: 3 additions & 5 deletions src/buffer/out/textBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,10 +698,8 @@ wil::unique_virtualalloc_ptr<std::byte> TextBuffer::_allocateBuffer(til::size sz
const auto w = gsl::narrow<uint16_t>(sz.width);
const auto h = gsl::narrow<uint16_t>(sz.height);

const auto charsBytes = w * sizeof(wchar_t);
// The ROW::_indices array stores 1 more item than the buffer is wide.
// That extra column stores the past-the-end _chars pointer.
const auto indicesBytes = w * sizeof(uint16_t) + sizeof(uint16_t);
const auto charsBytes = ROW::CalculateCharsBufferStride(w);
const auto indicesBytes = ROW::CalculateCharOffsetsBufferStride(w);
const auto rowStride = charsBytes + indicesBytes;
// 65535*65535 cells would result in a charsAreaSize of 8GiB.
// --> Use uint64_t so that we can safely do our calculations even on x86.
Expand Down Expand Up @@ -926,7 +924,7 @@ til::point TextBuffer::BufferToScreenPosition(const til::point position) const n
// Routine Description:
// - Resets the text contents of this buffer with the default character
// and the default current color attributes
void TextBuffer::Reset()
void TextBuffer::Reset() noexcept
{
const auto attr = GetCurrentAttributes();

Expand Down
2 changes: 1 addition & 1 deletion src/buffer/out/textBuffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ class TextBuffer final
til::point ScreenToBufferPosition(const til::point position) const noexcept;
til::point BufferToScreenPosition(const til::point position) const noexcept;

void Reset();
void Reset() noexcept;

[[nodiscard]] HRESULT ResizeTraditional(const til::size newSize) noexcept;

Expand Down

0 comments on commit a919562

Please sign in to comment.