Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-40095: [C++][Parquet] Remove AVX512 variants of BYTE_STREAM_SPLIT encoding #40127

Merged
merged 1 commit into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 2 additions & 220 deletions cpp/src/arrow/util/byte_stream_split_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,226 +332,11 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const int64_t num_valu
}
#endif // ARROW_HAVE_AVX2

#if defined(ARROW_HAVE_AVX512)
template <int kNumStreams>
void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride,
uint8_t* out) {
static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams.");
constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2);
constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;

const int64_t size = num_values * kNumStreams;
if (size < kBlockSize) // Back to AVX2 for small size
return ByteStreamSplitDecodeAvx2<kNumStreams>(data, num_values, stride, out);
const int64_t num_blocks = size / kBlockSize;

// First handle suffix.
const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
for (int64_t i = num_processed_elements; i < num_values; ++i) {
uint8_t gathered_byte_data[kNumStreams];
for (int b = 0; b < kNumStreams; ++b) {
const int64_t byte_index = b * stride + i;
gathered_byte_data[b] = data[byte_index];
}
memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams);
}

// Processed hierarchically using the unpack, then two shuffles.
__m512i stage[kNumStreamsLog2 + 1][kNumStreams];
__m512i shuffle[kNumStreams];
__m512i final_result[kNumStreams];
constexpr int kNumStreamsHalf = kNumStreams / 2U;

for (int64_t i = 0; i < num_blocks; ++i) {
for (int j = 0; j < kNumStreams; ++j) {
stage[0][j] = _mm512_loadu_si512(
reinterpret_cast<const __m512i*>(&data[i * sizeof(__m512i) + j * stride]));
}

for (int step = 0; step < kNumStreamsLog2; ++step) {
for (int j = 0; j < kNumStreamsHalf; ++j) {
stage[step + 1][j * 2] =
_mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
stage[step + 1][j * 2 + 1] =
_mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
}
}

if constexpr (kNumStreams == 8) {
// path for double, 128i index:
// {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
// {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
// {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
// {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
stage[kNumStreamsLog2][1], 0b01000100);
shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
stage[kNumStreamsLog2][3], 0b01000100);
shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
stage[kNumStreamsLog2][5], 0b01000100);
shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
stage[kNumStreamsLog2][7], 0b01000100);
shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
stage[kNumStreamsLog2][1], 0b11101110);
shuffle[5] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
stage[kNumStreamsLog2][3], 0b11101110);
shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
stage[kNumStreamsLog2][5], 0b11101110);
shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
stage[kNumStreamsLog2][7], 0b11101110);

final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
} else {
// path for float, 128i index:
// {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
// {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
stage[kNumStreamsLog2][1], 0b01000100);
shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
stage[kNumStreamsLog2][3], 0b01000100);
shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
stage[kNumStreamsLog2][1], 0b11101110);
shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
stage[kNumStreamsLog2][3], 0b11101110);

final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
}

for (int j = 0; j < kNumStreams; ++j) {
_mm512_storeu_si512(
reinterpret_cast<__m512i*>(out + (i * kNumStreams + j) * sizeof(__m512i)),
final_result[j]);
}
}
}

template <int kNumStreams>
void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const int64_t num_values,
uint8_t* output_buffer_raw) {
static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams.");
constexpr int kBlockSize = sizeof(__m512i) * kNumStreams;

const int64_t size = num_values * kNumStreams;

if (size < kBlockSize) // Back to AVX2 for small size
return ByteStreamSplitEncodeAvx2<kNumStreams>(raw_values, num_values,
output_buffer_raw);

const int64_t num_blocks = size / kBlockSize;
const __m512i* raw_values_simd = reinterpret_cast<const __m512i*>(raw_values);
__m512i* output_buffer_streams[kNumStreams];
for (int i = 0; i < kNumStreams; ++i) {
output_buffer_streams[i] =
reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]);
}

// First handle suffix.
const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
for (int64_t i = num_processed_elements; i < num_values; ++i) {
for (int j = 0; j < kNumStreams; ++j) {
const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
output_buffer_raw[j * num_values + i] = byte_in_value;
}
}

constexpr int KNumUnpack = (kNumStreams == 8) ? 2 : 3;
__m512i final_result[kNumStreams];
__m512i unpack[KNumUnpack + 1][kNumStreams];
__m512i permutex[kNumStreams];
__m512i permutex_mask;
if constexpr (kNumStreams == 8) {
// use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version.
permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006,
0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004,
0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002,
0x00190011, 0x00090001, 0x00180010, 0x00080000);
} else {
permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D,
0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00);
}

for (int64_t block_index = 0; block_index < num_blocks; ++block_index) {
for (int i = 0; i < kNumStreams; ++i) {
unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]);
}

for (int unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) {
for (int i = 0; i < kNumStreams / 2; ++i) {
unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8(
unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8(
unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
}
}

if constexpr (kNumStreams == 8) {
// path for double
// 1. unpack to epi16 block
// 2. permutexvar_epi16 to 128i block
// 3. shuffle 128i to final 512i target, index:
// {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
// {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
// {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
// {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
for (int i = 0; i < kNumStreams; ++i)
permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]);

__m512i shuffle[kNumStreams];
shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100);
shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110);
shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100);
shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110);

final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
} else {
// Path for float.
// 1. Processed hierarchically to 32i block using the unpack intrinsics.
// 2. Pack 128i block using _mm256_permutevar8x32_epi32.
// 3. Pack final 256i block with _mm256_permute2x128_si256.
for (int i = 0; i < kNumStreams; ++i)
permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]);

final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
}

for (int i = 0; i < kNumStreams; ++i) {
_mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]);
}
}
}
#endif // ARROW_HAVE_AVX512

#if defined(ARROW_HAVE_SIMD_SPLIT)
template <int kNumStreams>
void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
int64_t stride, uint8_t* out) {
#if defined(ARROW_HAVE_AVX512)
return ByteStreamSplitDecodeAvx512<kNumStreams>(data, num_values, stride, out);
#elif defined(ARROW_HAVE_AVX2)
#if defined(ARROW_HAVE_AVX2)
return ByteStreamSplitDecodeAvx2<kNumStreams>(data, num_values, stride, out);
#elif defined(ARROW_HAVE_SSE4_2)
return ByteStreamSplitDecodeSse2<kNumStreams>(data, num_values, stride, out);
Expand All @@ -563,10 +348,7 @@ void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
template <int kNumStreams>
void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const int64_t num_values,
uint8_t* output_buffer_raw) {
#if defined(ARROW_HAVE_AVX512)
return ByteStreamSplitEncodeAvx512<kNumStreams>(raw_values, num_values,
output_buffer_raw);
#elif defined(ARROW_HAVE_AVX2)
#if defined(ARROW_HAVE_AVX2)
return ByteStreamSplitEncodeAvx2<kNumStreams>(raw_values, num_values,
output_buffer_raw);
#elif defined(ARROW_HAVE_SSE4_2)
Expand Down
4 changes: 0 additions & 4 deletions cpp/src/arrow/util/byte_stream_split_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,6 @@ class TestByteStreamSplitSpecialized : public ::testing::Test {
#if defined(ARROW_HAVE_AVX2)
encode_funcs_.push_back({"avx2", &ByteStreamSplitEncodeAvx2<kWidth>});
decode_funcs_.push_back({"avx2", &ByteStreamSplitDecodeAvx2<kWidth>});
#endif
#if defined(ARROW_HAVE_AVX512)
encode_funcs_.push_back({"avx512", &ByteStreamSplitEncodeAvx512<kWidth>});
decode_funcs_.push_back({"avx512", &ByteStreamSplitDecodeAvx512<kWidth>});
#endif
}

Expand Down
27 changes: 0 additions & 27 deletions cpp/src/parquet/encoding_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -468,33 +468,6 @@ BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE);
#endif

#if defined(ARROW_HAVE_AVX512)
static void BM_ByteStreamSplitDecode_Float_Avx512(benchmark::State& state) {
BM_ByteStreamSplitDecode<float>(
state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512<sizeof(float)>);
}

static void BM_ByteStreamSplitDecode_Double_Avx512(benchmark::State& state) {
BM_ByteStreamSplitDecode<double>(
state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512<sizeof(double)>);
}

static void BM_ByteStreamSplitEncode_Float_Avx512(benchmark::State& state) {
BM_ByteStreamSplitEncode<float>(
state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512<sizeof(float)>);
}

static void BM_ByteStreamSplitEncode_Double_Avx512(benchmark::State& state) {
BM_ByteStreamSplitEncode<double>(
state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512<sizeof(double)>);
}

BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE);
#endif

template <typename DType>
static auto MakeDeltaBitPackingInputFixed(size_t length) {
using T = typename DType::c_type;
Expand Down
Loading