diff --git a/src/native/containers/dn-simdhash-arch.h b/src/native/containers/dn-simdhash-arch.h
index d99288e3476ac..22f3902eafa80 100644
--- a/src/native/containers/dn-simdhash-arch.h
+++ b/src/native/containers/dn-simdhash-arch.h
@@ -92,15 +92,10 @@ build_search_vector (uint8_t needle)
 
 // returns an index in range 0-13 on match, 14-32 if no match
 static DN_FORCEINLINE(uint32_t)
-find_first_matching_suffix (
+find_first_matching_suffix_simd (
 	dn_simdhash_search_vector needle,
 	// Only used by the vectorized implementations; discarded by scalar.
-	dn_simdhash_suffixes haystack,
-	// HACK: Pass the address of haystack.values directly, for scalar fallback.
-	// Without this, clang makes a full unaligned copy of haystack before calling us.
-	// Discarded by the vectorized implementations.
-	uint8_t haystack_values[DN_SIMDHASH_VECTOR_WIDTH],
-	uint32_t count
+	dn_simdhash_suffixes haystack
 ) {
 #if defined(__wasm_simd128__)
 	return ctz(wasm_i8x16_bitmask(wasm_i8x16_eq(needle.vec, haystack.vec)));
@@ -123,37 +118,8 @@ find_first_matching_suffix (
 	msb.b[1] = vaddv_u8(vget_high_u8(masked.vec));
 	return ctz(msb.u);
 #else
-	// HACK: We can't put this in a common helper function without introducing a temporary
-	//  unaligned copy-from-table-to-stack in wasm-without-simd
-#define ITER(offset) \
-	if (needle == haystack_values[offset]) \
-		return offset;
-
-	// It is safe to unroll this without bounds checks
-	// One would expect this to blow out the branch predictor, but in my testing
-	//  it's significantly faster when there is no match, and slightly faster
-	//  for cases where there is a match.
-	// Looping from 0-count is slower than this in my testing, even though it's
-	//  going to check fewer suffixes most of the time - probably due to the
-	//  comparison against count for each suffix.
-	// FIXME: If we move this into the specialization header, we can limit the
-	//  number of unrolled iterations to the number of keys in the bucket.
-	ITER(0);
-	ITER(1);
-	ITER(2);
-	ITER(3);
-	ITER(4);
-	ITER(5);
-	ITER(6);
-	ITER(7);
-	ITER(8);
-	ITER(9);
-	ITER(10);
-	ITER(11);
-	ITER(12);
-	ITER(13);
-#undef ITER
-	return 32;
+	dn_simdhash_assert(!"Scalar fallback should be in use here");
+    return 32;
 #endif
 }
 
@@ -194,17 +160,12 @@ build_search_vector (uint8_t needle)
 
 // returns an index in range 0-13 on match, 14-32 if no match
 static DN_FORCEINLINE(uint32_t)
-find_first_matching_suffix_internal (
-	__m128i needle, __m128i haystack,
-	uint32_t count
+find_first_matching_suffix_simd (
+	dn_simdhash_search_vector needle, dn_simdhash_suffixes haystack
 ) {
-	return ctz(_mm_movemask_epi8(_mm_cmpeq_epi8(needle, haystack)));
+	return ctz(_mm_movemask_epi8(_mm_cmpeq_epi8(needle.m128, haystack.m128)));
 }
 
-// use a macro to discard haystack_values, otherwise MSVC's codegen is worse
-#define find_first_matching_suffix(needle, haystack, haystack_values, count) \
-	find_first_matching_suffix_internal(needle.m128, haystack.m128, count)
-
 #else // unknown compiler and/or unknown non-simd arch
 
 #define DN_SIMDHASH_USE_SCALAR_FALLBACK 1
@@ -228,22 +189,6 @@ build_search_vector (uint8_t needle)
 	return needle;
 }
 
-// returns an index in range 0-14 on match, 32 if no match
-static DN_FORCEINLINE(uint32_t)
-find_first_matching_suffix (
-	dn_simdhash_search_vector needle, dn_simdhash_suffixes haystack,
-	uint8_t haystack_values[DN_SIMDHASH_VECTOR_WIDTH], uint32_t count
-) {
-	// TODO: It might be profitable to hand-unroll this loop, but right now doing so
-	//  hits a bug in clang and generates really bad WASM.
-	// HACK: We can't put this in a common helper function without introducing a temporary
-	//  unaligned copy-from-table-to-stack in wasm-without-simd
-	for (uint32_t i = 0; i < count; i++)
-		if (needle == haystack_values[i])
-			return i;
-	return 32;
-}
-
 #endif // end of clang/gcc or msvc or fallback
 
 #endif // __DN_SIMDHASH_ARCH_H__
diff --git a/src/native/containers/dn-simdhash-ght-compatible.c b/src/native/containers/dn-simdhash-ght-compatible.c
index 330f2a14f7385..eced9aac478c8 100644
--- a/src/native/containers/dn-simdhash-ght-compatible.c
+++ b/src/native/containers/dn-simdhash-ght-compatible.c
@@ -80,7 +80,6 @@ dn_simdhash_ght_replaced (dn_simdhash_ght_data data, void * old_key, void * new_
 #define DN_SIMDHASH_NO_DEFAULT_NEW 1
 
 #include "dn-simdhash-specialization.h"
-#include "dn-simdhash-ght-compatible.h"
 
 dn_simdhash_ght_t *
 dn_simdhash_ght_new (
diff --git a/src/native/containers/dn-simdhash-ght-compatible.h b/src/native/containers/dn-simdhash-ght-compatible.h
index b99d67477ccc5..4474f02346d3b 100644
--- a/src/native/containers/dn-simdhash-ght-compatible.h
+++ b/src/native/containers/dn-simdhash-ght-compatible.h
@@ -1,3 +1,8 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+typedef dn_simdhash_t dn_simdhash_ght_t;
+
 typedef void         (*dn_simdhash_ght_destroy_func) (void * data);
 typedef unsigned int (*dn_simdhash_ght_hash_func)    (const void * key);
 typedef int32_t      (*dn_simdhash_ght_equal_func)   (const void * a, const void * b);
diff --git a/src/native/containers/dn-simdhash-specialization.h b/src/native/containers/dn-simdhash-specialization.h
index 093ffaf20a7a4..7c5d9b2ecdb1b 100644
--- a/src/native/containers/dn-simdhash-specialization.h
+++ b/src/native/containers/dn-simdhash-specialization.h
@@ -95,6 +95,48 @@ dn_simdhash_meta_t DN_SIMDHASH_T_META = {
 	sizeof(DN_SIMDHASH_INSTANCE_DATA_T),
 };
 
+static DN_FORCEINLINE(uint32_t)
+find_first_matching_suffix_scalar (
+	uint8_t needle,
+	uint8_t haystack[DN_SIMDHASH_VECTOR_WIDTH]
+) {
+	uint32_t result = 32;
+	// ITERs for indices beyond our specialization's bucket capacity will be
+	//  constant-false and not check the specific bucket slot
+#define ITER(offset) \
+	{ \
+		/* Avoid MSVC C4127 by computing this separately in a temp local */ \
+		uint8_t in_bounds = (offset < DN_SIMDHASH_BUCKET_CAPACITY); \
+		if (in_bounds && (needle == haystack[offset])) \
+			result = offset; \
+	}
+
+	// It is safe to unroll this without bounds checks
+	// Looping from 0-count is slower than this in my testing, even though it's
+	//  going to check fewer suffixes most of the time - probably due to the
+	//  comparison against count for each suffix.
+	// Scanning in reverse and conditionally modifying result allows clang to
+	//  emit a chain of 'select' operations per slot on wasm, which produces
+	//  smaller code that seems to be much faster than a chain of
+	//  'if (...) return' for successful matches, and only slightly slower
+	//  for failed matches
+	ITER(13);
+	ITER(12);
+	ITER(11);
+	ITER(10);
+	ITER(9);
+	ITER(8);
+	ITER(7);
+	ITER(6);
+	ITER(5);
+	ITER(4);
+	ITER(3);
+	ITER(2);
+	ITER(1);
+	ITER(0);
+#undef ITER
+	return result;
+}
 
 static DN_FORCEINLINE(void)
 check_self (DN_SIMDHASH_T_PTR self)
@@ -152,7 +194,12 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
 		overflow_count = dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_CASCADED_SLOT);
 	// We could early-out here when count==0, but it doesn't appear to meaningfully improve
 	//  search performance to do so, and might actually worsen it
-	uint32_t index = find_first_matching_suffix(search_vector, bucket_suffixes, bucket_suffixes.values, count);
+#ifdef DN_SIMDHASH_USE_SCALAR_FALLBACK
+	uint32_t index = find_first_matching_suffix_scalar(search_vector, bucket->suffixes.values);
+#else
+	uint32_t index = find_first_matching_suffix_simd(search_vector, bucket_suffixes);
+#endif
+#undef bucket_suffixes
 	for (; index < count; index++) {
 		// FIXME: Could be profitable to manually hoist the data load outside of the loop,
 		//  if not out of SCAN_BUCKET_INTERNAL entirely. Clang appears to do LICM on it.
@@ -164,8 +211,6 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
 			return index;
 	}
 
-#undef bucket_suffixes
-
 	if (overflow_count)
 		return DN_SIMDHASH_SCAN_BUCKET_OVERFLOWED;
 	else
@@ -173,13 +218,13 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
 }
 
 // Helper macros so that we can optimize and change scan logic more easily
-#define BEGIN_SCAN_BUCKETS(initial_index, bucket_index, bucket_address) \
+#define BEGIN_SCAN_BUCKETS(buffers, initial_index, bucket_index, bucket_address) \
 	{ \
 		uint32_t bucket_index = initial_index, scan_buckets_length = buffers.buckets_length; \
 		bucket_t *restrict bucket_address = address_of_bucket(buffers, bucket_index); \
 		do {
 
-#define END_SCAN_BUCKETS(initial_index, bucket_index, bucket_address) \
+#define END_SCAN_BUCKETS(buffers, initial_index, bucket_index, bucket_address) \
 			bucket_index++; \
 			bucket_address++; \
 			/* Wrap around if we hit the last bucket. */ \
@@ -211,7 +256,7 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
 static void
 adjust_cascaded_counts (dn_simdhash_buffers_t buffers, uint32_t first_bucket_index, uint32_t last_bucket_index, uint8_t increase)
 {
-	BEGIN_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address)
+	BEGIN_SCAN_BUCKETS(buffers, first_bucket_index, bucket_index, bucket_address)
 		if (bucket_index == last_bucket_index)
 			break;
 
@@ -224,26 +269,25 @@ adjust_cascaded_counts (dn_simdhash_buffers_t buffers, uint32_t first_bucket_ind
 				dn_simdhash_bucket_set_cascaded_count(bucket_address->suffixes, cascaded_count - 1);
 			}
 		}
-	END_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address)
+	END_SCAN_BUCKETS(buffers, first_bucket_index, bucket_index, bucket_address)
 }
 
-static DN_SIMDHASH_VALUE_T *
+static DN_FORCEINLINE(DN_SIMDHASH_VALUE_T *)
 DN_SIMDHASH_FIND_VALUE_INTERNAL (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key, uint32_t key_hash)
 {
-	dn_simdhash_buffers_t buffers = hash->buffers;
 	uint8_t suffix = dn_simdhash_select_suffix(key_hash);
-	uint32_t first_bucket_index = dn_simdhash_select_bucket_index(buffers, key_hash);
+	uint32_t first_bucket_index = dn_simdhash_select_bucket_index(hash->buffers, key_hash);
 	dn_simdhash_search_vector search_vector = build_search_vector(suffix);
 
-	BEGIN_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address)
+	BEGIN_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address)
 		int index_in_bucket = DN_SIMDHASH_SCAN_BUCKET_INTERNAL(hash, bucket_address, key, search_vector);
 		if (index_in_bucket >= 0) {
 			uint32_t value_slot_index = (bucket_index * DN_SIMDHASH_BUCKET_CAPACITY) + index_in_bucket;
-			return address_of_value(buffers, value_slot_index);
+			return address_of_value(hash->buffers, value_slot_index);
 		} else if (index_in_bucket == DN_SIMDHASH_SCAN_BUCKET_NO_OVERFLOW) {
 			return NULL;
 		}
-	END_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address)
+	END_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address)
 
 	return NULL;
 }
@@ -288,12 +332,11 @@ DN_SIMDHASH_TRY_INSERT_INTERNAL (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key,
 		return DN_SIMDHASH_INSERT_NEED_TO_GROW;
 	}
 
-	dn_simdhash_buffers_t buffers = hash->buffers;
 	uint8_t suffix = dn_simdhash_select_suffix(key_hash);
 	uint32_t first_bucket_index = dn_simdhash_select_bucket_index(hash->buffers, key_hash);
 	dn_simdhash_search_vector search_vector = build_search_vector(suffix);
 
-	BEGIN_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address)
+	BEGIN_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address)
 		// If necessary, check the current bucket for the key
 		if (mode != DN_SIMDHASH_INSERT_MODE_REHASHING) {
 			int index_in_bucket = DN_SIMDHASH_SCAN_BUCKET_INTERNAL(hash, bucket_address, key, search_vector);
@@ -325,19 +368,19 @@ DN_SIMDHASH_TRY_INSERT_INTERNAL (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key,
 			*key_slot_address = key;
 			// Now store the value, it's in a different cache line
 			uint32_t value_slot_index = (bucket_index * DN_SIMDHASH_BUCKET_CAPACITY) + new_index;
-			DN_SIMDHASH_VALUE_T *restrict value_slot_address = address_of_value(buffers, value_slot_index);
+			DN_SIMDHASH_VALUE_T *restrict value_slot_address = address_of_value(hash->buffers, value_slot_index);
 			*value_slot_address = value;
 			// printf("Inserted [%zd, %zd] in bucket %d at index %d\n", key, value, bucket_index, new_index);
 			// If we cascaded out of our original target bucket, scan through our probe path
 			//  and increase the cascade counters. We have to wait until now to do that, because
 			//  during the process of getting here we may end up finding a duplicate, which would
 			//  leave the cascade counters in a corrupted state
-			adjust_cascaded_counts(buffers, first_bucket_index, bucket_index, 1);
+			adjust_cascaded_counts(hash->buffers, first_bucket_index, bucket_index, 1);
 			return DN_SIMDHASH_INSERT_OK_ADDED_NEW;
 		}
 
 		// The current bucket is full, so try the next bucket.
-	END_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address)
+	END_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address)
 
 	return DN_SIMDHASH_INSERT_NEED_TO_GROW;
 }
@@ -362,10 +405,9 @@ DN_SIMDHASH_REHASH_INTERNAL (DN_SIMDHASH_T_PTR hash, dn_simdhash_buffers_t old_b
 static void
 DN_SIMDHASH_DESTROY_ALL (DN_SIMDHASH_T_PTR hash)
 {
-	dn_simdhash_buffers_t buffers = hash->buffers;
-	BEGIN_SCAN_PAIRS(buffers, key_address, value_address)
+	BEGIN_SCAN_PAIRS(hash->buffers, key_address, value_address)
 		DN_SIMDHASH_ON_REMOVE(DN_SIMDHASH_GET_DATA(hash), *key_address, *value_address);
-	END_SCAN_PAIRS(buffers, key_address, value_address)
+	END_SCAN_PAIRS(hash->buffers, key_address, value_address)
 }
 #endif
 
@@ -385,14 +427,6 @@ dn_simdhash_vtable_t DN_SIMDHASH_T_VTABLE = {
 DN_SIMDHASH_T_PTR
 DN_SIMDHASH_NEW (uint32_t capacity, dn_allocator_t *allocator)
 {
-	// If this isn't satisfied, the generic code will allocate incorrectly sized buffers
-	// HACK: Use static_assert because for some reason assert produces unused variable warnings only on CI
-	struct silence_nuisance_msvc_warning { bucket_t a, b; };
-	static_assert(
-		sizeof(struct silence_nuisance_msvc_warning) == (sizeof(bucket_t) * 2),
-		"Inconsistent spacing/sizing for bucket_t"
-	);
-
 	return dn_simdhash_new_internal(&DN_SIMDHASH_T_META, DN_SIMDHASH_T_VTABLE, capacity, allocator);
 }
 #endif
@@ -475,12 +509,11 @@ DN_SIMDHASH_TRY_REMOVE_WITH_HASH (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key,
 {
 	check_self(hash);
 
-	dn_simdhash_buffers_t buffers = hash->buffers;
 	uint8_t suffix = dn_simdhash_select_suffix(key_hash);
-	uint32_t first_bucket_index = dn_simdhash_select_bucket_index(buffers, key_hash);
+	uint32_t first_bucket_index = dn_simdhash_select_bucket_index(hash->buffers, key_hash);
 	dn_simdhash_search_vector search_vector = build_search_vector(suffix);
 
-	BEGIN_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address)
+	BEGIN_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address)
 		int index_in_bucket = DN_SIMDHASH_SCAN_BUCKET_INTERNAL(hash, bucket_address, key, search_vector);
 		if (index_in_bucket >= 0) {
 			// We found the item. Replace it with the last item in the bucket, then erase
@@ -490,8 +523,8 @@ DN_SIMDHASH_TRY_REMOVE_WITH_HASH (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key,
 			uint32_t value_slot_index = (bucket_index * DN_SIMDHASH_BUCKET_CAPACITY) + index_in_bucket,
 				replacement_value_slot_index = (bucket_index * DN_SIMDHASH_BUCKET_CAPACITY) + replacement_index_in_bucket;
 
-			DN_SIMDHASH_VALUE_T *value_address = address_of_value(buffers, value_slot_index);
-			DN_SIMDHASH_VALUE_T *replacement_address = address_of_value(buffers, replacement_value_slot_index);
+			DN_SIMDHASH_VALUE_T *value_address = address_of_value(hash->buffers, value_slot_index);
+			DN_SIMDHASH_VALUE_T *replacement_address = address_of_value(hash->buffers, replacement_value_slot_index);
 			DN_SIMDHASH_KEY_T *key_address = &bucket_address->keys[index_in_bucket];
 			DN_SIMDHASH_KEY_T *replacement_key_address = &bucket_address->keys[replacement_index_in_bucket];
 
@@ -529,7 +562,7 @@ DN_SIMDHASH_TRY_REMOVE_WITH_HASH (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key,
 			//  to go through all the buckets we visited on the way here and reduce
 			//  their cascade counters (if possible), to maintain better scan performance.
 			if (bucket_index != first_bucket_index)
-				adjust_cascaded_counts(buffers, first_bucket_index, bucket_index, 0);
+				adjust_cascaded_counts(hash->buffers, first_bucket_index, bucket_index, 0);
 
 #if DN_SIMDHASH_HAS_REMOVE_HANDLER
 			// We've finished removing the item, so we're in a consistent state and can notify
@@ -539,7 +572,7 @@ DN_SIMDHASH_TRY_REMOVE_WITH_HASH (DN_SIMDHASH_T_PTR hash, DN_SIMDHASH_KEY_T key,
 			return 1;
 		} else if (index_in_bucket == DN_SIMDHASH_SCAN_BUCKET_NO_OVERFLOW)
 			return 0;
-	END_SCAN_BUCKETS(first_bucket_index, bucket_index, bucket_address)
+	END_SCAN_BUCKETS(hash->buffers, first_bucket_index, bucket_index, bucket_address)
 
 	return 0;
 }
diff --git a/src/native/containers/dn-simdhash.h b/src/native/containers/dn-simdhash.h
index 2a26083ec7df1..da4a7914e1887 100644
--- a/src/native/containers/dn-simdhash.h
+++ b/src/native/containers/dn-simdhash.h
@@ -108,12 +108,9 @@ dn_simdhash_select_suffix (uint32_t key_hash)
 	return (key_hash >> 24) | DN_SIMDHASH_SUFFIX_SALT;
 }
 
-static DN_FORCEINLINE(uint32_t)
-dn_simdhash_select_bucket_index (dn_simdhash_buffers_t buffers, uint32_t key_hash)
-{
-	// This relies on bucket count being a power of two.
-	return key_hash & (buffers.buckets_length - 1);
-}
+// This relies on bucket count being a power of two.
+#define dn_simdhash_select_bucket_index(buffers, key_hash) \
+	((key_hash) & ((buffers).buckets_length - 1))
 
 
 // Creates a simdhash with the provided configuration metadata, vtable, size, and allocator.
diff --git a/src/native/containers/simdhash-benchmark/Makefile b/src/native/containers/simdhash-benchmark/Makefile
index 7aa316ba1555b..6349d1c96538b 100644
--- a/src/native/containers/simdhash-benchmark/Makefile
+++ b/src/native/containers/simdhash-benchmark/Makefile
@@ -2,25 +2,32 @@
 
 dn_deps := $(wildcard ../*.c) $(wildcard ../*.h)
 benchmark_deps := $(wildcard ./*.c) $(wildcard ./*.h)
+# I don't know why this is necessary
+nodejs_path := $(shell which node)
 
 benchmark_sources := ../dn-simdhash.c ../dn-vector.c ./benchmark.c ../dn-simdhash-u32-ptr.c ../dn-simdhash-string-ptr.c ./ghashtable.c ./all-measurements.c
 common_options := -g -O3 -DNO_CONFIG_H -lm -DNDEBUG
+ifeq ($(SIMD), 0)
+	wasm_options := -mbulk-memory
+else
+	wasm_options := -mbulk-memory -msimd128
+endif
 
 benchmark-native: $(dn_deps) $(benchmark_deps)
 	clang $(benchmark_sources) $(common_options) -DSIZEOF_VOID_P=8
 
 benchmark-wasm: $(dn_deps) $(benchmark_deps)
-	~/Projects/emscripten/emcc $(benchmark_sources) $(common_options) -DSIZEOF_VOID_P=4 -mbulk-memory -msimd128
+	~/Projects/emscripten/emcc $(benchmark_sources) $(common_options) $(wasm_options) -DSIZEOF_VOID_P=4
 
 disassemble-benchmark: benchmark-native benchmark-wasm
 	objdump -d ./a.out > ./a.dis
 	~/wabt/bin/wasm2wat ./a.out.wasm > ./a.wat
 
 run-native: benchmark-native
-	./a.out
+	./a.out $(FILTER)
 
 run-wasm: benchmark-wasm
-	node ./a.out.js
+	$(nodejs_path) ./a.out.js $(FILTER)
 
 default_target: disassemble-benchmark
 
diff --git a/src/native/containers/simdhash-benchmark/all-measurements.c b/src/native/containers/simdhash-benchmark/all-measurements.c
index 2d04bbd427012..ac3356e69712b 100644
--- a/src/native/containers/simdhash-benchmark/all-measurements.c
+++ b/src/native/containers/simdhash-benchmark/all-measurements.c
@@ -1,6 +1,8 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+#define _CRT_RAND_S
+
 #include <stdint.h>
 #include <inttypes.h>
 #include <stdio.h>
diff --git a/src/native/containers/simdhash-benchmark/all-measurements.h b/src/native/containers/simdhash-benchmark/all-measurements.h
index 2e49927337a34..e6646f203e49e 100644
--- a/src/native/containers/simdhash-benchmark/all-measurements.h
+++ b/src/native/containers/simdhash-benchmark/all-measurements.h
@@ -6,27 +6,57 @@
 #ifndef MEASUREMENTS_IMPLEMENTATION
 #define MEASUREMENTS_IMPLEMENTATION 1
 
-// If this is too large and libc's rand() is low quality (i.e. MSVC),
-//  initializing the data will take forever
-#define INNER_COUNT 1024 * 16
+#define INNER_COUNT 1024 * 32
 #define BASELINE_SIZE 20480
 
 static dn_simdhash_u32_ptr_t *random_u32s_hash;
 static dn_vector_t *sequential_u32s, *random_u32s, *random_unused_u32s;
 
+// rand() isn't guaranteed to give 32 random bits on all targets, so
+//  use xoshiro seeded with 4 known integers
+// We want to use the same random sequence on every benchmark run, otherwise
+//  execution times could vary
+uint64_t rol64(uint64_t x, int k) {
+	return (x << k) | (x >> (64 - k));
+}
+
+static uint64_t rng_state[4] = {
+    0x6ED11324ABA9232Cul,
+    0x1F0C07E6522724A6ul,
+    0x293F7FDA2AF571D4ul,
+    0x5804EC9FFD70112Aul,
+};
+
+uint64_t xoshiro256ss() {
+	uint64_t const result = rol64(rng_state[1] * 5, 7) * 9;
+	uint64_t const t = rng_state[1] << 17;
+
+	rng_state[2] ^= rng_state[0];
+	rng_state[3] ^= rng_state[1];
+	rng_state[1] ^= rng_state[2];
+	rng_state[0] ^= rng_state[3];
+
+	rng_state[2] ^= t;
+	rng_state[3] = rol64(rng_state[3], 45);
+
+	return result;
+}
+
+static uint32_t random_uint () {
+    return (uint32_t)(xoshiro256ss() & 0xFFFFFFFFu);
+}
+
 static void init_data () {
     random_u32s_hash = dn_simdhash_u32_ptr_new(INNER_COUNT, NULL);
     sequential_u32s = dn_vector_alloc(sizeof(uint32_t));
     random_u32s = dn_vector_alloc(sizeof(uint32_t));
     random_unused_u32s = dn_vector_alloc(sizeof(uint32_t));
-    // For consistent data between runs
-    srand(1);
 
     for (uint32_t i = 0; i < INNER_COUNT; i++) {
         dn_vector_push_back(sequential_u32s, i);
 
 retry: {
-        uint32_t key = (uint32_t)(rand() & 0xFFFFFFFFu);
+        uint32_t key = random_uint();
         if (!dn_simdhash_u32_ptr_try_add(random_u32s_hash, key, NULL))
             goto retry;
 
@@ -36,7 +66,7 @@ retry: {
 
     for (uint32_t i = 0; i < INNER_COUNT; i++) {
 retry2: {
-        uint32_t key = (uint32_t)(rand() & 0xFFFFFFFFu);
+        uint32_t key = random_uint();
         if (!dn_simdhash_u32_ptr_try_add(random_u32s_hash, key, NULL))
             goto retry2;
 
diff --git a/src/native/containers/simdhash-benchmark/run-benchmark.ps1 b/src/native/containers/simdhash-benchmark/run-benchmark.ps1
index 536ba40cd9144..9417d16c505cb 100755
--- a/src/native/containers/simdhash-benchmark/run-benchmark.ps1
+++ b/src/native/containers/simdhash-benchmark/run-benchmark.ps1
@@ -1,2 +1,2 @@
-cl /GS- /O2 /std:c17 ./*.c ../dn-simdhash-u32-ptr.c ../dn-simdhash.c ../dn-vector.c ../dn-simdhash-string-ptr.c /DNO_CONFIG_H /DSIZEOF_VOID_P=8 /Fe:all-measurements.exe
+cl /GS- /O2 /std:c17 ./*.c ../dn-simdhash-u32-ptr.c ../dn-simdhash.c ../dn-vector.c ../dn-simdhash-string-ptr.c /DNO_CONFIG_H /DSIZEOF_VOID_P=8 /DNDEBUG /Fe:all-measurements.exe
 ./all-measurements.exe