// Sentinel meaning "elapsed time unknown / unmeasurably large".
// That's not UINT64_MAX as it's converted to int64_t sometimes.
constexpr uint64_t timer_inf = INT64_MAX;

/**
 * Computes the number of ticks elapsed between two timer readings.
 *
 * @param a later reading (as returned by timer_end())
 * @param b earlier reading (as returned by timer_start())
 * @return `a - b` in ticks, or `timer_inf` when the difference cannot be
 *         trusted (a sentinel operand, non-increasing readings, or - on
 *         MIPS - an interval long enough for the 32-bit counter to wrap).
 *
 * On MIPS a reading is a packed value rather than a plain counter:
 *   bits 63..60  cycle-counter resolution minus one,
 *   bits 59..32  28-bit wall-clock seconds,
 *   bits 31..0   32-bit cycle counter.
 */
static inline uint64_t timer_sub(uint64_t a, uint64_t b)
{
  // A sentinel operand is a failed reading, not a timestamp. Without this
  // check the MIPS branch would decode the sentinel's bits as scale, seconds
  // and counter fields (two failed readings would yield a delta of 0), and
  // the generic branch would return `timer_inf - b`.
  if (a == timer_inf || b == timer_inf) return timer_inf;
#if defined(__mips__)
  // Tune SafeMHzFor32CC to reflect max clock freq. of your machine with 32-bit cycle counter.
  const uint32_t SafeMHzFor32CC = 1500;

  // Readings taken with different counter resolutions are not comparable.
  if ((a >> 60) != (b >> 60)) return timer_inf;
  const uint32_t scale = (a >> 60) + 1;
  const uint32_t acc = a & UINT32_MAX, bcc = b & UINT32_MAX;
  uint32_t as = (a >> 32) & (UINT32_MAX >> 4), bs = (b >> 32) & (UINT32_MAX >> 4);
  if (as < bs) as += UINT32_C(1) << 28; // 28-bit seconds value overflows every 8 years, so wrap is safe
  // Upper bound of the elapsed whole seconds (both stamps are truncated).
  const uint32_t dsceil = as - bs + 1;
  // Seconds the 32-bit counter can cover at SafeMHzFor32CC without wrapping.
  const uint32_t safeds = UINT32_MAX / 1000000 * scale / SafeMHzFor32CC; // that's floor()
  if (dsceil > safeds) return timer_inf; // 32-bit CC value wraps every ~two seconds @ 2 GHz
  const uint32_t diff = acc - bcc; // well-defined unsigned overflow
  return uint64_t(diff) * scale;
#else
  // Equal or decreasing readings carry no usable information.
  return (a > b) ? a - b : timer_inf; // true 64-bit cycle counter overflows once a century @ 4 GHz
#endif
}

// CLOCK_MONOTONIC_RAW access is measurably faster on some platforms.
#if defined(CLOCK_MONOTONIC_RAW)
# define CLOCK_MONOTONIC_FASTEST CLOCK_MONOTONIC_RAW
#elif defined(CLOCK_MONOTONIC)
# define CLOCK_MONOTONIC_FASTEST CLOCK_MONOTONIC
#else
# undef CLOCK_MONOTONIC_FASTEST
#endif

/**
 * Returns a timestamp in nanoseconds.
 *
 * Uses clock_gettime() with CLOCK_MONOTONIC_FASTEST when that macro is
 * defined; otherwise falls back to gettimeofday(), whose microsecond value
 * is scaled to nanoseconds.
 */
__inline__ uint64_t timeofday()
{
#ifdef CLOCK_MONOTONIC_FASTEST
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC_FASTEST, &ts);
  return int64_t(ts.tv_sec) * 1000000000 + ts.tv_nsec;
#else
# warning neither CLOCK_MONOTONIC nor CLOCK_MONOTONIC_RAW is defined, no nanosecond precision.
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return int64_t(tv.tv_sec) * 1000000000 + tv.tv_usec * 1000;
#endif
}
#if defined(__mips__)
#if defined(__mips16) && !defined(__mips16e2) && (_MIPS_ISA == _MIPS_ISA_MIPS32 && __mips_isa_rev >= 2)
// `rdhwr` is MIPS32r2 or MIPS16e2 and not MIPS16. Some OpenWRT builds run
// with `-mips32r2 -mtune=24kc -mips16`, so MIPS16 has to be disabled for alike
// builds to get access to `rdhwr` from assembler's standpoint, otherwise build fails.
__attribute__((nomips16))
#endif
/**
 * Reads the MIPS cycle counter and packs it into a 64-bit timestamp.
 *
 * @return a packed value with
 *           bits 63..60  counter resolution (CCRES) minus one,
 *           bits 59..32  low 28 bits of the wall-clock seconds,
 *           bits 31..0   the 32-bit cycle counter (CC),
 *         or `timer_inf` when CCRES minus one does not fit into 4 bits.
 */
__inline__ uint64_t timer_mips()
{
  // Access to these registers _might_ be prohibited to user-mode code,
  // but there is no way to check it. Linux allows it in configure_hwrena():
  // https://github.com/torvalds/linux/blob/v6.9/arch/mips/kernel/traps.c#L2190-L2194
  uint32_t cntr, scale;
  asm volatile(
    "rdhwr %0, $2\n\t" // MIPS_HWR_CC
    "rdhwr %1, $3\n\t" // MIPS_HWR_CCRES
    : "=r" (cntr), "=r" (scale));
  // Store CCRES - 1 so that it fits the 4-bit field; CCRES == 0 wraps to
  // UINT32_MAX here and is rejected by the range check as well.
  scale--;
  if (scale > 15)
    return timer_inf;
  // Unfortunately, 32-bit counter overflows in a few seconds, so wall clock timestamp
  // has to be embedded into the timer value. Hopefully, clock_gettime call is VDSO...
#ifdef CLOCK_MONOTONIC_FASTEST
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC_FASTEST, &ts);
  const uint32_t s28 = ts.tv_sec & (UINT32_MAX >> 4);
#else
  // CLOCK_MONOTONIC_FASTEST is left undefined when neither monotonic clock id
  // exists; use the same gettimeofday() fallback as timeofday() instead of
  // failing to compile. Only whole seconds are needed here.
  struct timeval tv;
  gettimeofday(&tv, NULL);
  const uint32_t s28 = tv.tv_sec & (UINT32_MAX >> 4);
#endif
  return (uint64_t(scale) << 60) | (uint64_t(s28) << 32) | cntr;
}
#endif // __mips__
1%% deletions)\n", @@ -378,7 +376,7 @@ double HashMapSpeedTest ( pfHash pfhash, const int hashbits, found++; } end = timer_end(); - t = (double)(end - begin) / (double)words.size(); + t = double(timer_sub(end, begin)) / words.size(); if(found > 0 && t > 0) times.push_back(t); } hashmap.clear(); @@ -414,7 +412,7 @@ double HashMapSpeedTest ( pfHash pfhash, const int hashbits, phashmap.erase(line); } end = timer_end(); - t1 = (double)(end - begin) / (double)words.size(); + t1 = double(timer_sub(end, begin)) / words.size(); } fflush(NULL); printf("%0.3f cycles/op (%zu inserts, 1%% deletions)\n", @@ -438,7 +436,7 @@ double HashMapSpeedTest ( pfHash pfhash, const int hashbits, found++; } end = timer_end(); - t = (double)(end - begin) / (double)words.size(); + t = double(timer_sub(end, begin)) / words.size(); if(found > 0 && t > 0) times.push_back(t); } phashmap.clear(); diff --git a/main.cpp b/main.cpp index 5e7499b0..e1a89838 100644 --- a/main.cpp +++ b/main.cpp @@ -1027,8 +1027,14 @@ void test ( hashfunc hash, HashInfo* info ) printf("[[[ Speed Tests ]]]\n\n"); if (timer_counts_ns()) printf("WARNING: no cycle counter, cycle == 1ns\n"); - if (timer_start() == timer_end()) - printf("WARNING: timer resolution is low\n"); + { + const uint64_t begin = timer_start(), end = timer_end(); + const uint64_t delta = timer_sub(end, begin); + if (delta > 64) // "good" is ~30..40 ticks + printf("WARNING: timer resolution is %llu (%#llx) ticks (%#llx - %#llx). Broken VDSO?\n", + (unsigned long long)delta, (unsigned long long)delta, + (unsigned long long)end, (unsigned long long)begin); + } fflush(NULL); Seed_init (info, info->verification);