Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce simd algorithm for bitpacking #568

Merged
merged 1 commit into from
Dec 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,6 @@ jobs:
ccache --show-stats
echo Run ctest -V -L DFLY
#GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1
GLOG_logtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=2,snapshot=2 ctest -V -L DFLY
GLOG_logtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=1,snapshot=1 ctest -V -L DFLY
./dragonfly_test --mem_defrag_threshold=0.05 # trying to catch issue with defrag
# GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1 CTEST_OUTPUT_ON_FAILURE=1 ninja server/test
4 changes: 3 additions & 1 deletion src/core/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
add_library(dfly_core compact_object.cc dragonfly_core.cc extent_tree.cc
external_alloc.cc interpreter.cc json_object.cc mi_memory_resource.cc
segment_allocator.cc small_string.cc tx_queue.cc dense_set.cc string_set.cc)
segment_allocator.cc small_string.cc tx_queue.cc dense_set.cc string_set.cc
detail/bitpacking.cc)

cxx_link(dfly_core base absl::flat_hash_map absl::str_format redis_lib TRDP::lua lua_modules
Boost::fiber TRDP::jsoncons crypto)

Expand Down
130 changes: 6 additions & 124 deletions src/core/compact_object.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,15 @@ extern "C" {
#include "base/flags.h"
#include "base/logging.h"
#include "base/pod_array.h"
#include "core/detail/bitpacking.h"
#include "core/string_set.h"

#if defined(__aarch64__)
#include "base/sse2neon.h"
#else
#include <emmintrin.h>
#endif

ABSL_FLAG(bool, use_set2, true, "If true use DenseSet for an optimized set data structure");

namespace dfly {
using namespace std;
using absl::GetFlag;
using detail::binpacked_len;

namespace {

Expand Down Expand Up @@ -154,35 +150,6 @@ inline void FreeObjStream(void* ptr) {
freeStream((stream*)ptr);
}

// Daniel Lemire's function validate_ascii_fast() - under Apache/MIT license.
// See https://github.com/lemire/fastvalidate-utf-8/
// Returns true when every byte of src[0, len) lies in 0x00..0x7F, i.e. none of
// them has its most significant bit set; returns false otherwise.
// An empty range (len == 0) is reported as valid.
bool validate_ascii_fast(const char* src, size_t len) {
  __m128i accumulated = _mm_setzero_si128();
  size_t pos = 0;

  // OR together all complete 16-byte blocks; any byte with the high bit set
  // leaves that bit set in the accumulator.
  for (size_t blocks_left = len / 16; blocks_left > 0; --blocks_left, pos += 16) {
    const __m128i chunk = _mm_loadu_si128((const __m128i*)(src + pos));
    accumulated = _mm_or_si128(accumulated, chunk);
  }

  // Fold the remaining (fewer than 16) bytes with scalar code.
  char tail_bits = 0;
  while (pos < len) {
    tail_bits |= src[pos];
    ++pos;
  }

  // movemask gathers the high bit of each accumulated byte; the tail adds its own.
  const int high_bits = _mm_movemask_epi8(accumulated) | (tail_bits & 0x80);
  return high_bits == 0;
}

// Maps an ascii length to the length of its 7-bit packed form: every 8 ascii
// bytes occupy 7 packed bytes, and the result is rounded up to a whole byte.
inline constexpr size_t binpacked_len(size_t ascii_len) {
  constexpr size_t kBitsPerAsciiChar = 7;
  constexpr size_t kBitsPerByte = 8;
  return (ascii_len * kBitsPerAsciiChar + (kBitsPerByte - 1)) / kBitsPerByte;
}

// converts 7-bit packed length back to ascii length. Note that this conversion
// is not accurate since it maps 7 bytes to 8 bytes (rounds up), while we may have
// 7 byte strings converted to 7 byte as well.
Expand Down Expand Up @@ -428,91 +395,6 @@ void RobjWrapper::MakeInnerRoom(size_t current_cap, size_t desired, pmr::memory_
inner_obj_ = newp;
}

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC push_options
#pragma GCC optimize("Ofast")
#endif

// Packs `len` 7-bit ascii chars from `ascii` into `bin`, without verifying that
// the input really is ascii. Every complete group of 8 input chars is squeezed
// into 7 output bytes; a trailing group shorter than 8 chars is copied verbatim.
// NOTE(review): the previous comment said "len must be at least 16"; nothing in
// this body depends on that - confirm whether callers still rely on it.
void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
  size_t remaining = len;

  for (; remaining >= 8; remaining -= 8, ascii += 8) {
    // Output byte k holds the upper (7 - k) bits of char k and the lower
    // (k + 1) bits of char k + 1.
    for (unsigned k = 0; k < 7; ++k) {
      *bin++ = (ascii[k] >> k) | (ascii[k + 1] << (7 - k));
    }
  }

  // epilog - fewer than 8 chars left, store them unpacked.
  for (size_t k = 0; k < remaining; ++k) {
    bin[k] = ascii[k];
  }
}

// Unpacks an 8->7 encoded blob back to ascii.
// `bin` is the packed input, `ascii_len` is the number of ascii chars to produce
// and `ascii` is the destination buffer that receives exactly `ascii_len` chars.
// Every complete group of 8 output chars consumes 7 input bytes; a trailing group
// of fewer than 8 chars is copied byte-for-byte from the input.
// Generally, we can not unpack inplace because the ascii (dest) buffer is 8/7 bigger than
// the source buffer.
// However, if binary data is positioned on the right of the ascii buffer with empty space on the
// left, then we can unpack inplace.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
constexpr uint8_t kM = 0x7F;  // mask that keeps the low 7 bits of an output char
uint8_t p = 0;  // previously read input byte; its upper bits belong to the next char
unsigned i = 0;

while (ascii_len >= 8) {
for (i = 0; i < 7; ++i) {
uint8_t src = *bin; // keep on stack in case we unpack inplace.
// Output char i = upper i bits of the previous byte (as its low bits) combined
// with the low (7 - i) bits of the current byte shifted up by i.
// For i == 0 the shift is 8, so the promoted 8-bit value of p contributes nothing.
*ascii++ = (p >> (8 - i)) | ((src << i) & kM);
p = src;
++bin;
}

ascii_len -= 8;
// The 8th char of the group is the upper 7 bits of the 7th input byte.
*ascii++ = p >> 1;
}

// The loop above only exits once fewer than 8 chars remain.
DCHECK_LT(ascii_len, 8u);
// Tail chars were stored unpacked, so copy them as they are.
for (i = 0; i < ascii_len; ++i) {
*ascii++ = *bin++;
}
}

// Compares a packed string with a plain ascii string without unpacking.
// `packed` must be of length binpacked_len(ascii_len). Returns true when packing
// `ascii` would yield exactly the bytes found in `packed`, false otherwise.
bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len) {
  size_t remaining = ascii_len;

  // Complete groups: 8 ascii chars correspond to 7 packed bytes.
  while (remaining >= 8) {
    bool group_matches = true;
    for (unsigned k = 0; k < 7; ++k) {
      const uint8_t expected = (ascii[k] >> k) | (ascii[k + 1] << (7 - k));
      // Accumulate without branching; the verdict is checked once per group.
      group_matches &= (expected == packed[k]);
    }

    if (!group_matches) {
      return false;
    }

    ascii += 8;
    packed += 7;
    remaining -= 8;
  }

  // Tail: fewer than 8 chars are stored unpacked, compare them directly.
  for (size_t k = 0; k < remaining; ++k) {
    if (ascii[k] != packed[k]) {
      return false;
    }
  }

  return true;
}

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

} // namespace detail

using namespace std;
Expand Down Expand Up @@ -777,7 +659,7 @@ void CompactObj::SetString(std::string_view str) {
DCHECK_GT(str.size(), kInlineLen);

string_view encoded = str;
bool is_ascii = kUseAsciiEncoding && validate_ascii_fast(str.data(), str.size());
bool is_ascii = kUseAsciiEncoding && detail::validate_ascii_fast(str.data(), str.size());

if (is_ascii) {
size_t encode_len = binpacked_len(str.size());
Expand All @@ -792,7 +674,7 @@ void CompactObj::SetString(std::string_view str) {
}

tl.tmp_buf.resize(encode_len);
detail::ascii_pack(str.data(), str.size(), tl.tmp_buf.data());
detail::ascii_pack_simd(str.data(), str.size(), tl.tmp_buf.data());
encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), encode_len};

if (encoded.size() <= kInlineLen) {
Expand Down Expand Up @@ -1125,7 +1007,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
if (u_.r_obj.Size() != encode_len)
return false;

if (!validate_ascii_fast(sv.data(), sv.size()))
if (!detail::validate_ascii_fast(sv.data(), sv.size()))
return false;

return detail::compare_packed(to_byte(u_.r_obj.inner_obj()), sv.data(), sv.size());
Expand All @@ -1139,7 +1021,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
if (u_.small_str.size() != encode_len)
return false;

if (!validate_ascii_fast(sv.data(), sv.size()))
if (!detail::validate_ascii_fast(sv.data(), sv.size()))
return false;

// We need to compare an unpacked sv with 2 packed parts.
Expand Down
10 changes: 0 additions & 10 deletions src/core/compact_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,6 @@ class RobjWrapper {

} __attribute__((packed));

// unpacks 8->7 encoded blob back to ascii.
// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
// the source buffer.
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
// left, then we can unpack inplace.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii);

// packs ascii string (does not verify) into binary form saving 1 bit per byte on average (12.5%).
void ascii_pack(const char* ascii, size_t len, uint8_t* bin);

} // namespace detail

class CompactObj {
Expand Down
80 changes: 75 additions & 5 deletions src/core/compact_object_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "base/gtest.h"
#include "base/logging.h"
#include "core/detail/bitpacking.h"
#include "core/flat_set.h"
#include "core/json_object.h"
#include "core/mi_memory_resource.h"
Expand Down Expand Up @@ -189,13 +190,24 @@ TEST_F(CompactObjectTest, AsciiUtil) {
std::string_view data{"aaaaaabb"};
uint8_t buf[32];

char ascii2[] = "xxxxxxxxxxxxxx";
detail::ascii_pack(data.data(), 7, buf);
detail::ascii_unpack(buf, 7, ascii2);
char outbuf[32] = "xxxxxxxxxxxxxx";
detail::ascii_pack_simd(data.data(), 7, buf);
detail::ascii_unpack(buf, 7, outbuf);

ASSERT_EQ('x', ascii2[7]) << ascii2;
std::string_view actual{ascii2, 7};
ASSERT_EQ('x', outbuf[7]) << outbuf;
std::string_view actual{outbuf, 7};
ASSERT_EQ(data.substr(0, 7), actual);

string data3;
for (unsigned i = 0; i < 97; ++i) {
data3.append("12345678910");
}
string act_str(data3.size(), 'y');
std::vector<uint8_t> binvec(detail::binpacked_len(data3.size()));
detail::ascii_pack_simd(data3.data(), data3.size(), binvec.data());
detail::ascii_unpack(binvec.data(), data3.size(), act_str.data());

ASSERT_EQ(data3, act_str);
}

TEST_F(CompactObjectTest, IntSet) {
Expand Down Expand Up @@ -453,4 +465,62 @@ TEST_F(CompactObjectTest, JsonTypeWithPathTest) {
}
}

// Scalar reference implementation of the 8->7 ascii packing.
// Each complete group of 8 input chars yields 7 output bytes; the remaining
// (fewer than 8) chars are written to `bin` unchanged.
static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
  const size_t full_groups = len / 8;

  for (size_t g = 0; g < full_groups; ++g) {
    const char* group = ascii + g * 8;
    for (unsigned k = 0; k < 7; ++k) {
      *bin++ = (group[k] >> k) | (group[k + 1] << (7 - k));
    }
  }

  // epilog - we do not pack since we have less than 8 bytes.
  const char* const stop = ascii + len;
  for (const char* tail = ascii + full_groups * 8; tail != stop; ++tail) {
    *bin++ = *tail;
  }
}

// Benchmarks ascii_pack_naive() on a 1024-char string of 'a'.
static void BM_PackNaive(benchmark::State& state) {
  const string input(1024, 'a');
  uint8_t packed[1024];  // the same size as the input

  while (state.KeepRunning()) {
    ascii_pack_naive(input.data(), input.size(), packed);
  }
}
BENCHMARK(BM_PackNaive);

// Benchmarks detail::ascii_pack() on a 1024-char string of 'a'.
static void BM_Pack(benchmark::State& state) {
  const string input(1024, 'a');
  uint8_t packed[1024];  // the same size as the input

  while (state.KeepRunning()) {
    detail::ascii_pack(input.data(), input.size(), packed);
  }
}
BENCHMARK(BM_Pack);

// Benchmarks detail::ascii_pack() on a 1024-char string of 'a'.
// NOTE(review): this body is identical to BM_Pack - confirm whether a different
// packing routine was meant to be measured here.
static void BM_Pack2(benchmark::State& state) {
  const string input(1024, 'a');
  uint8_t packed[1024];  // the same size as the input

  while (state.KeepRunning()) {
    detail::ascii_pack(input.data(), input.size(), packed);
  }
}
BENCHMARK(BM_Pack2);

// Benchmarks detail::ascii_pack_simd() on a 1024-char string of 'a'.
static void BM_PackSimd(benchmark::State& state) {
  const string input(1024, 'a');
  uint8_t packed[1024];  // the same size as the input

  while (state.KeepRunning()) {
    detail::ascii_pack_simd(input.data(), input.size(), packed);
  }
}
BENCHMARK(BM_PackSimd);

} // namespace dfly
Loading