Skip to content

Commit

Permalink
[libc] Adding a version of memset with software prefetching (#70857)
Browse files Browse the repository at this point in the history
Software prefetching helps recover performance when hardware prefetching
is disabled. The compile-time option 'LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING'
enables this code path.
  • Loading branch information
doshimili authored Nov 10, 2023
1 parent f7bbb58 commit 3153aa4
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 21 deletions.
4 changes: 4 additions & 0 deletions libc/config/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
"value": false,
"doc": "Read more than a byte at a time to perform byte-string operations like strlen."
},
"LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {
"value": false,
"doc": "Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled."
}
}
}
4 changes: 4 additions & 0 deletions libc/src/string/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ add_subdirectory(memory_utils)
if(LIBC_CONF_STRING_UNSAFE_WIDE_READ)
list(APPEND string_config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ")
endif()
if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
list(APPEND string_config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING")
endif()
if(string_config_options)
list(PREPEND string_config_options "COMPILE_OPTIONS")
endif()
Expand Down Expand Up @@ -656,6 +659,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memset(memset)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
Expand Down
8 changes: 6 additions & 2 deletions libc/src/string/memory_utils/op_generic.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,19 @@ template <typename T> struct Memset {
tail(dst, value, count);
}

LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
size_t count, size_t offset) {
static_assert(SIZE > 1, "a loop of size 1 does not need tail");
size_t offset = 0;
do {
block(dst + offset, value);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, value, count);
}

// Convenience entry point: runs loop_and_tail_offset from the very start of
// the buffer, i.e. with an initial offset of zero.
LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
  constexpr size_t START_OFFSET = 0;
  loop_and_tail_offset(dst, value, count, START_OFFSET);
}
};

template <typename T, typename... TS> struct MemsetSequence {
Expand Down
8 changes: 8 additions & 0 deletions libc/src/string/memory_utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,14 @@ template <size_t SIZE> struct AlignHelper {
uintptr_t offset_;
};

// Issues a prefetch hint for the memory at `dst`, announcing an upcoming
// write and requesting the highest temporal-locality level.
LIBC_INLINE void prefetch_for_write(CPtr dst) {
  constexpr int RW_WRITE = 1;     // second builtin argument: prepare for write
  constexpr int MAX_LOCALITY = 3; // third builtin argument: max locality
  __builtin_prefetch(dst, RW_WRITE, MAX_LOCALITY);
}

// Issues a prefetch hint for the memory at `dst`, announcing an upcoming
// read and requesting the highest temporal-locality level.
LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
  constexpr int RW_READ = 0;      // second builtin argument: prepare for read
  constexpr int MAX_LOCALITY = 3; // third builtin argument: max locality
  __builtin_prefetch(dst, RW_READ, MAX_LOCALITY);
}

} // namespace LIBC_NAMESPACE

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H
5 changes: 0 additions & 5 deletions libc/src/string/memory_utils/x86_64/inline_memcpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =

} // namespace x86

// TODO: Move to a shared header when appropriate.
[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
__builtin_prefetch(addr, 0, 3);
}

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
Expand Down
70 changes: 56 additions & 14 deletions libc/src/string/memory_utils/x86_64/inline_memset.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,67 @@
#include <stddef.h> // size_t

namespace LIBC_NAMESPACE {
namespace x86 {
// Byte spans, expressed in multiples of a 64-byte cache line, that the
// software-prefetching memset uses as prefetch offsets.
LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = 2 * kOneCachelineSize;
LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = 5 * kOneCachelineSize;

// Derived from LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING via
// LLVM_LIBC_IS_DEFINED; inline_memset_x86 tests it with `if constexpr` to
// choose the software-prefetching path.
LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);

} // namespace x86

[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;
#elif defined(__AVX__)
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = cpp::array<generic_v256, 2>;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
using uint128_t = generic_v128;
using uint256_t = cpp::array<generic_v128, 2>;
using uint512_t = cpp::array<generic_v128, 4>;
using uint128_t = generic_v128;
using uint256_t = cpp::array<generic_v128, 2>;
using uint512_t = cpp::array<generic_v128, 4>;
#else
using uint128_t = cpp::array<uint64_t, 2>;
using uint256_t = cpp::array<uint64_t, 4>;
using uint512_t = cpp::array<uint64_t, 8>;
using uint128_t = cpp::array<uint64_t, 2>;
using uint256_t = cpp::array<uint64_t, 4>;
using uint512_t = cpp::array<uint64_t, 8>;
#endif

// memset path used when software prefetching is enabled. The visible caller,
// inline_memset_x86, only reaches it after returning for count <= 64.
// Stores `value` over `count` bytes starting at `dst`, issuing write
// prefetches ahead of the stores.
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
// How far ahead of the current store offset to prefetch: 5 * 64 = 320 bytes.
constexpr size_t PREFETCH_DISTANCE = x86::kFiveCachelinesSize;
// Bytes covered per prefetching iteration: 2 * 64 = 128 bytes.
constexpr size_t PREFETCH_DEGREE = x86::kTwoCachelinesSize;
// Store granularity of the main loop (presumably 32 bytes for every
// uint256_t alias -- TODO confirm for generic_v256).
constexpr size_t SIZE = sizeof(uint256_t);
// Prefetch the cache line that starts 64 bytes past dst.
prefetch_for_write(dst + x86::kOneCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
// Prefetch the cache line that starts 128 bytes past dst.
prefetch_for_write(dst + x86::kTwoCachelinesSize);
// Aligned loop: store one block at the original dst, then let
// align_to_next_boundary<32> adjust dst and count (presumably it advances dst
// to the next 32-byte boundary and shrinks count to match; the helper is not
// shown here -- TODO confirm).
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
if (count <= 192) {
// Small remainder: plain block loop without further prefetching.
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
// Store the leading uint512_t + uint256_t sequence. offset starts at 96,
// presumably the byte size of that sequence (64 + 32) -- TODO confirm.
generic::MemsetSequence<uint512_t, uint256_t>::block(dst, value);
size_t offset = 96;
// The "+ SIZE" in the bound keeps offset + SIZE <= count after every
// iteration (given PREFETCH_DEGREE is a multiple of SIZE), so the block that
// loop_and_tail_offset's do-while always stores first stays within count.
while (offset + PREFETCH_DEGREE + SIZE <= count) {
// Prefetch the two cache lines located PREFETCH_DISTANCE ahead of the
// current store offset. These addresses can lie beyond dst + count.
prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
prefetch_for_write(dst + offset + PREFETCH_DISTANCE +
x86::kOneCachelineSize);
// Store PREFETCH_DEGREE bytes in SIZE-byte steps, advancing offset.
for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
generic::Memset<uint256_t>::block(dst + offset, value);
}
// Finish the remainder: block loop resuming at `offset`, then the tail.
generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
}
}

[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
if (count == 0)
return;
if (count == 1)
Expand All @@ -53,6 +93,8 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= 64)
return generic::Memset<uint256_t>::head_tail(dst, value, count);
if constexpr (x86::kUseSoftwarePrefetchingMemset)
return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
// Aligned loop
Expand Down
1 change: 1 addition & 0 deletions utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ PRINTF_COPTS = [
MEMORY_COPTS = [
# "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
# "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
# "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
]

# A flag to pick which `mpfr` to use for math tests.
Expand Down

0 comments on commit 3153aa4

Please sign in to comment.