From 24db19416de300c0c4b4be5128c648cbe765045c Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Fri, 19 Apr 2024 13:36:51 -0700 Subject: [PATCH] [mono] simdhash tweaks (#101082) * Optimize simdhash scalar bucket scan to generate chained cmovs instead of conditional branches, and to skip suffix slots that will always be empty * Optimize temporary copies of hash->buffers out of most simdhash APIs * Build fixes for ght-compatible simdhash * Add missing license headers * Improve simdhash microbenchmark suite and make it compatible with windows x64 MSVC * Use xoshiro256 to generate random values in microbenchmark, with a fixed seed, instead of libc rand * Add filtering support to simdhash microbenchmark makefile --- src/native/containers/dn-simdhash-arch.h | 69 ++---------- .../containers/dn-simdhash-ght-compatible.c | 1 - .../containers/dn-simdhash-ght-compatible.h | 5 + .../containers/dn-simdhash-specialization.h | 105 ++++++++++++------ src/native/containers/dn-simdhash.h | 9 +- .../containers/simdhash-benchmark/Makefile | 13 ++- .../simdhash-benchmark/all-measurements.c | 2 + .../simdhash-benchmark/all-measurements.h | 44 ++++++-- .../simdhash-benchmark/run-benchmark.ps1 | 2 +- 9 files changed, 134 insertions(+), 116 deletions(-) diff --git a/src/native/containers/dn-simdhash-arch.h b/src/native/containers/dn-simdhash-arch.h index d99288e3476ac3..22f3902eafa804 100644 --- a/src/native/containers/dn-simdhash-arch.h +++ b/src/native/containers/dn-simdhash-arch.h @@ -92,15 +92,10 @@ build_search_vector (uint8_t needle) // returns an index in range 0-13 on match, 14-32 if no match static DN_FORCEINLINE(uint32_t) -find_first_matching_suffix ( +find_first_matching_suffix_simd ( dn_simdhash_search_vector needle, // Only used by the vectorized implementations; discarded by scalar. - dn_simdhash_suffixes haystack, - // HACK: Pass the address of haystack.values directly, for scalar fallback. - // Without this, clang makes a full unaligned copy of haystack before calling us. - // Discarded by the vectorized implementations. - uint8_t haystack_values[DN_SIMDHASH_VECTOR_WIDTH], - uint32_t count + dn_simdhash_suffixes haystack ) { #if defined(__wasm_simd128__) return ctz(wasm_i8x16_bitmask(wasm_i8x16_eq(needle.vec, haystack.vec))); @@ -123,37 +118,8 @@ find_first_matching_suffix ( msb.b[1] = vaddv_u8(vget_high_u8(masked.vec)); return ctz(msb.u); #else - // HACK: We can't put this in a common helper function without introducing a temporary - // unaligned copy-from-table-to-stack in wasm-without-simd -#define ITER(offset) \ - if (needle == haystack_values[offset]) \ - return offset; - - // It is safe to unroll this without bounds checks - // One would expect this to blow out the branch predictor, but in my testing - // it's significantly faster when there is no match, and slightly faster - // for cases where there is a match. - // Looping from 0-count is slower than this in my testing, even though it's - // going to check fewer suffixes most of the time - probably due to the - // comparison against count for each suffix. - // FIXME: If we move this into the specialization header, we can limit the - // number of unrolled iterations to the number of keys in the bucket. - ITER(0); - ITER(1); - ITER(2); - ITER(3); - ITER(4); - ITER(5); - ITER(6); - ITER(7); - ITER(8); - ITER(9); - ITER(10); - ITER(11); - ITER(12); - ITER(13); -#undef ITER - return 32; + dn_simdhash_assert(!"Scalar fallback should be in use here"); + return 32; #endif } @@ -194,17 +160,12 @@ build_search_vector (uint8_t needle) // returns an index in range 0-13 on match, 14-32 if no match static DN_FORCEINLINE(uint32_t) -find_first_matching_suffix_internal ( - __m128i needle, __m128i haystack, - uint32_t count +find_first_matching_suffix_simd ( + dn_simdhash_search_vector needle, dn_simdhash_suffixes haystack ) { - return ctz(_mm_movemask_epi8(_mm_cmpeq_epi8(needle, haystack))); + return ctz(_mm_movemask_epi8(_mm_cmpeq_epi8(needle.m128, haystack.m128))); } -// use a macro to discard haystack_values, otherwise MSVC's codegen is worse -#define find_first_matching_suffix(needle, haystack, haystack_values, count) \ - find_first_matching_suffix_internal(needle.m128, haystack.m128, count) - #else // unknown compiler and/or unknown non-simd arch #define DN_SIMDHASH_USE_SCALAR_FALLBACK 1 @@ -228,22 +189,6 @@ build_search_vector (uint8_t needle) return needle; } -// returns an index in range 0-14 on match, 32 if no match -static DN_FORCEINLINE(uint32_t) -find_first_matching_suffix ( - dn_simdhash_search_vector needle, dn_simdhash_suffixes haystack, - uint8_t haystack_values[DN_SIMDHASH_VECTOR_WIDTH], uint32_t count -) { - // TODO: It might be profitable to hand-unroll this loop, but right now doing so - // hits a bug in clang and generates really bad WASM. - // HACK: We can't put this in a common helper function without introducing a temporary - // unaligned copy-from-table-to-stack in wasm-without-simd - for (uint32_t i = 0; i < count; i++) - if (needle == haystack_values[i]) - return i; - return 32; -} - #endif // end of clang/gcc or msvc or fallback #endif // __DN_SIMDHASH_ARCH_H__ diff --git a/src/native/containers/dn-simdhash-ght-compatible.c b/src/native/containers/dn-simdhash-ght-compatible.c index 330f2a14f7385a..eced9aac478c89 100644 --- a/src/native/containers/dn-simdhash-ght-compatible.c +++ b/src/native/containers/dn-simdhash-ght-compatible.c @@ -80,7 +80,6 @@ dn_simdhash_ght_replaced (dn_simdhash_ght_data data, void * old_key, void * new_ #define DN_SIMDHASH_NO_DEFAULT_NEW 1 #include "dn-simdhash-specialization.h" -#include "dn-simdhash-ght-compatible.h" dn_simdhash_ght_t * dn_simdhash_ght_new ( diff --git a/src/native/containers/dn-simdhash-ght-compatible.h b/src/native/containers/dn-simdhash-ght-compatible.h index b99d67477ccc5b..4474f02346d3b6 100644 --- a/src/native/containers/dn-simdhash-ght-compatible.h +++ b/src/native/containers/dn-simdhash-ght-compatible.h @@ -1,3 +1,8 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +typedef dn_simdhash_t dn_simdhash_ght_t; + typedef void (*dn_simdhash_ght_destroy_func) (void * data); typedef unsigned int (*dn_simdhash_ght_hash_func) (const void * key); typedef int32_t (*dn_simdhash_ght_equal_func) (const void * a, const void * b); diff --git a/src/native/containers/dn-simdhash-specialization.h b/src/native/containers/dn-simdhash-specialization.h index 093ffaf20a7a4e..7c5d9b2ecdb1b5 100644 --- a/src/native/containers/dn-simdhash-specialization.h +++ b/src/native/containers/dn-simdhash-specialization.h @@ -95,6 +95,48 @@ dn_simdhash_meta_t DN_SIMDHASH_T_META = { sizeof(DN_SIMDHASH_INSTANCE_DATA_T), }; +static DN_FORCEINLINE(uint32_t) +find_first_matching_suffix_scalar ( + uint8_t needle, + uint8_t haystack[DN_SIMDHASH_VECTOR_WIDTH] +) { + uint32_t result = 32; + // ITERs for indices beyond our specialization's bucket capacity will be + // constant-false and not check the specific bucket slot +#define ITER(offset) \ + { \ + /* Avoid MSVC C4127 by computing this separately in a temp local */ \ + uint8_t in_bounds = (offset < DN_SIMDHASH_BUCKET_CAPACITY); \ + if (in_bounds && (needle == haystack[offset])) \ + result = offset; \ + } + + // It is safe to unroll this without bounds checks + // Looping from 0-count is slower than this in my testing, even though it's + // going to check fewer suffixes most of the time - probably due to the + // comparison against count for each suffix. + // Scanning in reverse and conditionally modifying result allows clang to + // emit a chain of 'select' operations per slot on wasm, which produces + // smaller code that seems to be much faster than a chain of + // 'if (...) return' for successful matches, and only slightly slower + // for failed matches + ITER(13); + ITER(12); + ITER(11); + ITER(10); + ITER(9); + ITER(8); + ITER(7); + ITER(6); + ITER(5); + ITER(4); + ITER(3); + ITER(2); + ITER(1); + ITER(0); +#undef ITER + return result; +} static DN_FORCEINLINE(void) check_self (DN_SIMDHASH_T_PTR self) @@ -152,7 +194,12 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc overflow_count = dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_CASCADED_SLOT); // We could early-out here when count==0, but it doesn't appear to meaningfully improve // search performance to do so, and might actually worsen it - uint32_t index = find_first_matching_suffix(search_vector, bucket_suffixes, bucket_suffixes.values, count); +#ifdef DN_SIMDHASH_USE_SCALAR_FALLBACK + uint32_t index = find_first_matching_suffix_scalar(search_vector, bucket->suffixes.values); +#else + uint32_t index = find_first_matching_suffix_simd(search_vector, bucket_suffixes); +#endif +#undef bucket_suffixes for (; index < count; index++) { // FIXME: Could be profitable to manually hoist the data load outside of the loop, // if not out of SCAN_BUCKET_INTERNAL entirely. Clang appears to do LICM on it. @@ -164,8 +211,6 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc return index; } -#undef bucket_suffixes - if (overflow_count) return DN_SIMDHASH_SCAN_BUCKET_OVERFLOWED; else @@ -173,13 +218,13 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc } // Helper macros so that we can optimize and change scan logic more easily -#define BEGIN_SCAN_BUCKETS(initial_index, bucket_index, bucket_address) \ +#define BEGIN_SCAN_BUCKETS(buffers, initial_index, bucket_index, bucket_address) \ { \ uint32_t bucket_index = initial_index, scan_buckets_length = buffers.buckets_length; \ bucket_t *restrict bucket_address = address_of_bucket(buffers, bucket_index); \ do { -#define END_SCAN_BUCKETS(initial_index, bucket_index, bucket_address) \ +#define END_SCAN_BUCKETS(buffers, initial_index, bucket_index, bucket_address) \ bucket_index++; \ bucket_address++; \ /* Wrap around if we hit the last bucket. */ \ @@ -211,7 +256,7 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc static void adjust_cascaded_counts (dn_simdhash_buffers_t buffers, uint32_t first_bucket_index, uint32_t last_bucket_index, uint8_t increase) { - BEGIN_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address) + BEGIN_SCAN_BUCKETS(buffers, first_bucket_index, bucket_index, bucket_address) if (bucket_index == last_bucket_index) break; @@ -224,26 +269,25 @@ adjust_cascaded_counts (dn_simdhash_buffers_t buffers, uint32_t first_bucket_ind dn_simdhash_bucket_set_cascaded_count(bucket_address->suffixes, cascaded_count - 1); } } - END_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address) + END_SCAN_BUCKETS(buffers, first_bucket_index, bucket_index, bucket_address) } -static DN_SIMDHASH_VALUE_T * +static DN_FORCEINLINE(DN_SIMDHASH_VALUE_T *) DN_SIMDHASH_FIND_VALUE_INTERNAL (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key, uint32_t key_hash) { - dn_simdhash_buffers_t buffers = hash->buffers; uint8_t suffix = dn_simdhash_select_suffix(key_hash); - uint32_t first_bucket_index = dn_simdhash_select_bucket_index(buffers, key_hash); + uint32_t first_bucket_index = dn_simdhash_select_bucket_index(hash->buffers, key_hash); dn_simdhash_search_vector search_vector = build_search_vector(suffix); - BEGIN_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address) + BEGIN_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address) int index_in_bucket = DN_SIMDHASH_SCAN_BUCKET_INTERNAL(hash, bucket_address, key, search_vector); if (index_in_bucket >= 0) { uint32_t value_slot_index = (bucket_index * DN_SIMDHASH_BUCKET_CAPACITY) + index_in_bucket; - return address_of_value(buffers, value_slot_index); + return address_of_value(hash->buffers, value_slot_index); } else if (index_in_bucket == DN_SIMDHASH_SCAN_BUCKET_NO_OVERFLOW) { return NULL; } - END_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address) + END_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address) return NULL; } @@ -288,12 +332,11 @@ DN_SIMDHASH_TRY_INSERT_INTERNAL (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key, return DN_SIMDHASH_INSERT_NEED_TO_GROW; } - dn_simdhash_buffers_t buffers = hash->buffers; uint8_t suffix = dn_simdhash_select_suffix(key_hash); uint32_t first_bucket_index = dn_simdhash_select_bucket_index(hash->buffers, key_hash); dn_simdhash_search_vector search_vector = build_search_vector(suffix); - BEGIN_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address) + BEGIN_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address) // If necessary, check the current bucket for the key if (mode != DN_SIMDHASH_INSERT_MODE_REHASHING) { int index_in_bucket = DN_SIMDHASH_SCAN_BUCKET_INTERNAL(hash, bucket_address, key, search_vector); @@ -325,19 +368,19 @@ DN_SIMDHASH_TRY_INSERT_INTERNAL (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key, *key_slot_address = key; // Now store the value, it's in a different cache line uint32_t value_slot_index = (bucket_index * DN_SIMDHASH_BUCKET_CAPACITY) + new_index; - DN_SIMDHASH_VALUE_T *restrict value_slot_address = address_of_value(buffers, value_slot_index); + DN_SIMDHASH_VALUE_T *restrict value_slot_address = address_of_value(hash->buffers, value_slot_index); *value_slot_address = value; // printf("Inserted [%zd, %zd] in bucket %d at index %d\n", key, value, bucket_index, new_index); // If we cascaded out of our original target bucket, scan through our probe path // and increase the cascade counters. We have to wait until now to do that, because // during the process of getting here we may end up finding a duplicate, which would // leave the cascade counters in a corrupted state - adjust_cascaded_counts(buffers, first_bucket_index, bucket_index, 1); + adjust_cascaded_counts(hash->buffers, first_bucket_index, bucket_index, 1); return DN_SIMDHASH_INSERT_OK_ADDED_NEW; } // The current bucket is full, so try the next bucket. - END_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address) + END_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address) return DN_SIMDHASH_INSERT_NEED_TO_GROW; } @@ -362,10 +405,9 @@ DN_SIMDHASH_REHASH_INTERNAL (DN_SIMDHASH_T_PTR hash, dn_simdhash_buffers_t old_b static void DN_SIMDHASH_DESTROY_ALL (DN_SIMDHASH_T_PTR hash) { - dn_simdhash_buffers_t buffers = hash->buffers; - BEGIN_SCAN_PAIRS(buffers, key_address, value_address) + BEGIN_SCAN_PAIRS(hash->buffers, key_address, value_address) DN_SIMDHASH_ON_REMOVE(DN_SIMDHASH_GET_DATA(hash), *key_address, *value_address); - END_SCAN_PAIRS(buffers, key_address, value_address) + END_SCAN_PAIRS(hash->buffers, key_address, value_address) } #endif @@ -385,14 +427,6 @@ dn_simdhash_vtable_t DN_SIMDHASH_T_VTABLE = { DN_SIMDHASH_T_PTR DN_SIMDHASH_NEW (uint32_t capacity, dn_allocator_t *allocator) { - // If this isn't satisfied, the generic code will allocate incorrectly sized buffers - // HACK: Use static_assert because for some reason assert produces unused variable warnings only on CI - struct silence_nuisance_msvc_warning { bucket_t a, b; }; - static_assert( - sizeof(struct silence_nuisance_msvc_warning) == (sizeof(bucket_t) * 2), - "Inconsistent spacing/sizing for bucket_t" - ); - return dn_simdhash_new_internal(&DN_SIMDHASH_T_META, DN_SIMDHASH_T_VTABLE, capacity, allocator); } #endif @@ -475,12 +509,11 @@ DN_SIMDHASH_TRY_REMOVE_WITH_HASH (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key, { check_self(hash); - dn_simdhash_buffers_t buffers = hash->buffers; uint8_t suffix = dn_simdhash_select_suffix(key_hash); - uint32_t first_bucket_index = dn_simdhash_select_bucket_index(buffers, key_hash); + uint32_t first_bucket_index = dn_simdhash_select_bucket_index(hash->buffers, key_hash); dn_simdhash_search_vector search_vector = build_search_vector(suffix); - BEGIN_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address) + BEGIN_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address) int index_in_bucket = DN_SIMDHASH_SCAN_BUCKET_INTERNAL(hash, bucket_address, key, search_vector); if (index_in_bucket >= 0) { // We found the item. Replace it with the last item in the bucket, then erase @@ -490,8 +523,8 @@ DN_SIMDHASH_TRY_REMOVE_WITH_HASH (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key, uint32_t value_slot_index = (bucket_index * DN_SIMDHASH_BUCKET_CAPACITY) + index_in_bucket, replacement_value_slot_index = (bucket_index * DN_SIMDHASH_BUCKET_CAPACITY) + replacement_index_in_bucket; - DN_SIMDHASH_VALUE_T *value_address = address_of_value(buffers, value_slot_index); - DN_SIMDHASH_VALUE_T *replacement_address = address_of_value(buffers, replacement_value_slot_index); + DN_SIMDHASH_VALUE_T *value_address = address_of_value(hash->buffers, value_slot_index); + DN_SIMDHASH_VALUE_T *replacement_address = address_of_value(hash->buffers, replacement_value_slot_index); DN_SIMDHASH_KEY_T *key_address = &bucket_address->keys[index_in_bucket]; DN_SIMDHASH_KEY_T *replacement_key_address = &bucket_address->keys[replacement_index_in_bucket]; @@ -529,7 +562,7 @@ DN_SIMDHASH_TRY_REMOVE_WITH_HASH (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key, // to go through all the buckets we visited on the way here and reduce // their cascade counters (if possible), to maintain better scan performance. if (bucket_index != first_bucket_index) - adjust_cascaded_counts(buffers, first_bucket_index, bucket_index, 0); + adjust_cascaded_counts(hash->buffers, first_bucket_index, bucket_index, 0); #if DN_SIMDHASH_HAS_REMOVE_HANDLER // We've finished removing the item, so we're in a consistent state and can notify @@ -539,7 +572,7 @@ DN_SIMDHASH_TRY_REMOVE_WITH_HASH (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key, return 1; } else if (index_in_bucket == DN_SIMDHASH_SCAN_BUCKET_NO_OVERFLOW) return 0; - END_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address) + END_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address) return 0; } diff --git a/src/native/containers/dn-simdhash.h b/src/native/containers/dn-simdhash.h index 2a26083ec7df1c..da4a7914e18873 100644 --- a/src/native/containers/dn-simdhash.h +++ b/src/native/containers/dn-simdhash.h @@ -108,12 +108,9 @@ dn_simdhash_select_suffix (uint32_t key_hash) return (key_hash >> 24) | DN_SIMDHASH_SUFFIX_SALT; } -static DN_FORCEINLINE(uint32_t) -dn_simdhash_select_bucket_index (dn_simdhash_buffers_t buffers, uint32_t key_hash) -{ - // This relies on bucket count being a power of two. - return key_hash & (buffers.buckets_length - 1); -} +// This relies on bucket count being a power of two. +#define dn_simdhash_select_bucket_index(buffers, key_hash) \ + ((key_hash) & ((buffers).buckets_length - 1)) // Creates a simdhash with the provided configuration metadata, vtable, size, and allocator. diff --git a/src/native/containers/simdhash-benchmark/Makefile b/src/native/containers/simdhash-benchmark/Makefile index 7aa316ba1555b6..6349d1c96538b5 100644 --- a/src/native/containers/simdhash-benchmark/Makefile +++ b/src/native/containers/simdhash-benchmark/Makefile @@ -2,25 +2,32 @@ dn_deps := $(wildcard ../*.c) $(wildcard ../*.h) benchmark_deps := $(wildcard ./*.c) $(wildcard ./*.h) +# I don't know why this is necessary +nodejs_path := $(shell which node) benchmark_sources := ../dn-simdhash.c ../dn-vector.c ./benchmark.c ../dn-simdhash-u32-ptr.c ../dn-simdhash-string-ptr.c ./ghashtable.c ./all-measurements.c common_options := -g -O3 -DNO_CONFIG_H -lm -DNDEBUG +ifeq ($(SIMD), 0) + wasm_options := -mbulk-memory +else + wasm_options := -mbulk-memory -msimd128 +endif benchmark-native: $(dn_deps) $(benchmark_deps) clang $(benchmark_sources) $(common_options) -DSIZEOF_VOID_P=8 benchmark-wasm: $(dn_deps) $(benchmark_deps) - ~/Projects/emscripten/emcc $(benchmark_sources) $(common_options) -DSIZEOF_VOID_P=4 -mbulk-memory -msimd128 + ~/Projects/emscripten/emcc $(benchmark_sources) $(common_options) $(wasm_options) -DSIZEOF_VOID_P=4 disassemble-benchmark: benchmark-native benchmark-wasm objdump -d ./a.out > ./a.dis ~/wabt/bin/wasm2wat ./a.out.wasm > ./a.wat run-native: benchmark-native - ./a.out + ./a.out $(FILTER) run-wasm: benchmark-wasm - node ./a.out.js + $(nodejs_path) ./a.out.js $(FILTER) default_target: disassemble-benchmark diff --git a/src/native/containers/simdhash-benchmark/all-measurements.c b/src/native/containers/simdhash-benchmark/all-measurements.c index 2d04bbd4270120..ac3356e69712b0 100644 --- a/src/native/containers/simdhash-benchmark/all-measurements.c +++ b/src/native/containers/simdhash-benchmark/all-measurements.c @@ -1,6 +1,8 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +#define _CRT_RAND_S + #include #include #include diff --git a/src/native/containers/simdhash-benchmark/all-measurements.h b/src/native/containers/simdhash-benchmark/all-measurements.h index 2e49927337a34f..e6646f203e49e9 100644 --- a/src/native/containers/simdhash-benchmark/all-measurements.h +++ b/src/native/containers/simdhash-benchmark/all-measurements.h @@ -6,27 +6,57 @@ #ifndef MEASUREMENTS_IMPLEMENTATION #define MEASUREMENTS_IMPLEMENTATION 1 -// If this is too large and libc's rand() is low quality (i.e. MSVC), -// initializing the data will take forever -#define INNER_COUNT 1024 * 16 +#define INNER_COUNT 1024 * 32 #define BASELINE_SIZE 20480 static dn_simdhash_u32_ptr_t *random_u32s_hash; static dn_vector_t *sequential_u32s, *random_u32s, *random_unused_u32s; +// rand() isn't guaranteed to give 32 random bits on all targets, so +// use xoshiro seeded with 4 known integers +// We want to use the same random sequence on every benchmark run, otherwise +// execution times could vary +uint64_t rol64(uint64_t x, int k) { + return (x << k) | (x >> (64 - k)); +} + +static uint64_t rng_state[4] = { + 0x6ED11324ABA9232Cul, + 0x1F0C07E6522724A6ul, + 0x293F7FDA2AF571D4ul, + 0x5804EC9FFD70112Aul, +}; + +uint64_t xoshiro256ss() { + uint64_t const result = rol64(rng_state[1] * 5, 7) * 9; + uint64_t const t = rng_state[1] << 17; + + rng_state[2] ^= rng_state[0]; + rng_state[3] ^= rng_state[1]; + rng_state[1] ^= rng_state[2]; + rng_state[0] ^= rng_state[3]; + + rng_state[2] ^= t; + rng_state[3] = rol64(rng_state[3], 45); + + return result; +} + +static uint32_t random_uint () { + return (uint32_t)(xoshiro256ss() & 0xFFFFFFFFu); +} + static void init_data () { random_u32s_hash = dn_simdhash_u32_ptr_new(INNER_COUNT, NULL); sequential_u32s = dn_vector_alloc(sizeof(uint32_t)); random_u32s = dn_vector_alloc(sizeof(uint32_t)); random_unused_u32s = dn_vector_alloc(sizeof(uint32_t)); - // For consistent data between runs - srand(1); for (uint32_t i = 0; i < INNER_COUNT; i++) { dn_vector_push_back(sequential_u32s, i); retry: { - uint32_t key = (uint32_t)(rand() & 0xFFFFFFFFu); + uint32_t key = random_uint(); if (!dn_simdhash_u32_ptr_try_add(random_u32s_hash, key, NULL)) goto retry; @@ -36,7 +66,7 @@ retry: { for (uint32_t i = 0; i < INNER_COUNT; i++) { retry2: { - uint32_t key = (uint32_t)(rand() & 0xFFFFFFFFu); + uint32_t key = random_uint(); if (!dn_simdhash_u32_ptr_try_add(random_u32s_hash, key, NULL)) goto retry2; diff --git a/src/native/containers/simdhash-benchmark/run-benchmark.ps1 b/src/native/containers/simdhash-benchmark/run-benchmark.ps1 index 536ba40cd9144f..9417d16c505cbf 100755 --- a/src/native/containers/simdhash-benchmark/run-benchmark.ps1 +++ b/src/native/containers/simdhash-benchmark/run-benchmark.ps1 @@ -1,2 +1,2 @@ -cl /GS- /O2 /std:c17 ./*.c ../dn-simdhash-u32-ptr.c ../dn-simdhash.c ../dn-vector.c ../dn-simdhash-string-ptr.c /DNO_CONFIG_H /DSIZEOF_VOID_P=8 /Fe:all-measurements.exe +cl /GS- /O2 /std:c17 ./*.c ../dn-simdhash-u32-ptr.c ../dn-simdhash.c ../dn-vector.c ../dn-simdhash-string-ptr.c /DNO_CONFIG_H /DSIZEOF_VOID_P=8 /DNDEBUG /Fe:all-measurements.exe ./all-measurements.exe