Skip to content

Commit

Permalink
Handle MIPS32 cycle counter overflow
Browse files Browse the repository at this point in the history
  • Loading branch information
darkk committed Aug 29, 2024
1 parent 0fea460 commit 3edb6bb
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 33 deletions.
89 changes: 67 additions & 22 deletions Platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#pragma once

#include <stdint.h>

#if defined(__aarch64__) && defined(HAVE_INT64)
// fixme: bad system call with threads (8 cpu octocore)
# define NCPU 0
Expand All @@ -20,6 +22,30 @@ void SetThreadAffinity ( std::thread &t, int cpu );
#endif
void SetAffinity ( int cpu );

// Sentinel meaning "elapsed time is unknown or too long to be measured".
// That's not UINT64_MAX as it's converted to int64_t sometimes.
constexpr uint64_t timer_inf = INT64_MAX;

// Returns the number of cycles elapsed from timestamp `b` (begin) to
// timestamp `a` (end), or timer_inf when the difference cannot be trusted.
//
// MIPS: both arguments are packed timestamps as built by timer_mips():
//   bits 63..60  counter scale minus one,
//   bits 59..32  low 28 bits of the wall-clock seconds,
//   bits 31..0   32-bit hardware cycle counter.
// The seconds field is only used to detect intervals long enough for the
// 32-bit counter to wrap; the result is the counter delta times the scale.
//
// Other platforms: plain 64-bit subtraction; `a <= b` yields timer_inf.
static inline uint64_t timer_sub(uint64_t a, uint64_t b)
{
#if defined(__mips__)
  // Tune SafeMHzFor32CC to reflect max clock freq. of your machine with 32-bit cycle counter.
  const uint32_t SafeMHzFor32CC = 1500;

  // timer_mips() hands out timer_inf itself when the counter is unusable.
  // Decoding that value as a packed timestamp would make two failed reads
  // subtract to 0 cycles, so propagate the sentinel instead.
  if (a == timer_inf || b == timer_inf) return timer_inf;

  // Timestamps taken with different scales are not comparable.
  if ((a >> 60) != (b >> 60)) return timer_inf;
  const uint32_t scale = (a >> 60) + 1;
  const uint32_t acc = a & UINT32_MAX, bcc = b & UINT32_MAX;
  uint32_t as = (a >> 32) & (UINT32_MAX >> 4), bs = (b >> 32) & (UINT32_MAX >> 4);
  if (as < bs) as += UINT32_C(1) << 28; // 28-bit seconds value overflows every 8 years, so wrap is safe
  const uint32_t dsceil = as - bs + 1; // upper bound of the elapsed whole seconds
  const uint32_t safeds = UINT32_MAX / 1000000 * scale / SafeMHzFor32CC; // that's floor()
  if (dsceil > safeds) return timer_inf; // 32-bit CC value wraps every ~two seconds @ 2 GHz
  const uint32_t diff = acc - bcc; // well-defined unsigned overflow
  return uint64_t(diff) * scale;
#else
  return (a > b) ? a - b : timer_inf; // true 64-bit cycle counter overflows once a century @ 4 GHz
#endif
}

#ifndef __x86_64__
#if defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64)
#define __x86_64__
Expand All @@ -44,7 +70,6 @@ void SetAffinity ( int cpu );
#include <stdlib.h>
#include <math.h> // Has to be included before intrin.h or VC complains about 'ceil'
#include <intrin.h> // for __rdtsc
#include <stdint.h>

#define ROTL32(x,y) _rotl(x,y)
#define ROTL64(x,y) _rotl64(x,y)
Expand Down Expand Up @@ -82,7 +107,6 @@ void SetAffinity ( int cpu );
#include <cstddef>
#endif
#include <stdlib.h>
#include <stdint.h>
#include <sys/time.h>
#if NCPU > 1
#include <pthread.h>
Expand Down Expand Up @@ -123,31 +147,29 @@ inline uint64_t rotr64 ( uint64_t x, int8_t r )
#define ROTR32(x,y) rotr32(x,y)
#define ROTR64(x,y) rotr64(x,y)

// Select the clock id passed to clock_gettime(): prefer CLOCK_MONOTONIC_RAW
// (its access is measurably faster on some platforms), fall back to
// CLOCK_MONOTONIC, and leave the macro undefined when neither exists.
#ifdef CLOCK_MONOTONIC_RAW
#  define CLOCK_MONOTONIC_FASTEST CLOCK_MONOTONIC_RAW
#else
#  ifdef CLOCK_MONOTONIC
#    define CLOCK_MONOTONIC_FASTEST CLOCK_MONOTONIC
#  else
#    undef CLOCK_MONOTONIC_FASTEST
#  endif
#endif

// Returns a timestamp in nanoseconds: tv_sec * 1e9 + tv_nsec as reported by
// clock_gettime(), or tv_sec * 1e9 + tv_usec * 1000 from gettimeofday() in the
// fallback branch (which also emits a compile-time warning about precision).
// NOTE(review): this hunk shows two clock-selection variants together (the
// local `clock` constant and the CLOCK_MONOTONIC_FASTEST macro) and the first
// `#if` has no matching `#endif` here - confirm against the committed file.
__inline__ uint64_t timeofday()
{
#if defined(CLOCK_MONOTONIC_RAW) || defined(CLOCK_MONOTONIC)
# if defined(CLOCK_MONOTONIC_RAW)
// CLOCK_MONOTONIC_RAW access is measurably faster on some platforms.
const clockid_t clock = CLOCK_MONOTONIC_RAW;
# else
const clockid_t clock = CLOCK_MONOTONIC;
# endif
#ifdef CLOCK_MONOTONIC_FASTEST
struct timespec ts;
clock_gettime(clock, &ts);
clock_gettime(CLOCK_MONOTONIC_FASTEST, &ts);
// Seconds are widened to 64 bits before scaling to nanoseconds.
return int64_t(ts.tv_sec) * 1000000000 + ts.tv_nsec;
#else
# warning neither CLOCK_MONOTONIC nor CLOCK_MONOTONIC_RAW is defined, no nanosecond precision.
struct timeval tv;
gettimeofday(&tv, NULL);
// Microseconds are scaled by 1000 so both branches return the same unit.
return int64_t(tv.tv_sec) * 1000000000 + tv.tv_usec * 1000;
#endif
}

#if defined(__mips16) && !defined(__mips16e2) && (_MIPS_ISA == _MIPS_ISA_MIPS32 && __mips_isa_rev >= 2)
// `rdhwr` is MIPS32r2 or MIPS16e2 and not MIPS16. Some OpenWRT builds run
// with `-mips32r2 -mtune=24kc -mips16`, so MIPS16 has to be disabled for such
// builds to get access to `rdhwr` from the assembler's standpoint.
__attribute__((nomips16))
#endif
__inline__ uint64_t rdtsc()
{
#ifdef _MSC_VER
Expand Down Expand Up @@ -180,20 +202,39 @@ __inline__ uint64_t rdtsc()
return (uint64_t)(pmccntr) * 64; // Should optimize to << 6
}
return timeofday();
#elif defined(__mips__)
#else
return timeofday();
#endif
}

#if defined(__mips__)
#if defined(__mips16) && !defined(__mips16e2) && (_MIPS_ISA == _MIPS_ISA_MIPS32 && __mips_isa_rev >= 2)
// `rdhwr` is MIPS32r2 or MIPS16e2 and not MIPS16. Some OpenWRT builds run
// with `-mips32r2 -mtune=24kc -mips16`, so MIPS16 has to be disabled for such
// builds to get access to `rdhwr` from the assembler's standpoint, otherwise the build fails.
__attribute__((nomips16))
#endif
// Reads the MIPS hardware registers $2 (MIPS_HWR_CC) and $3 (MIPS_HWR_CCRES)
// and returns a packed 64-bit timestamp: the register $3 value minus one in
// the bits from 60 up, the low 28 bits of the clock_gettime() seconds shifted
// to bit 32, and the 32-bit counter in the low word. Returns timer_inf when
// the register $3 value minus one does not fit into 4 bits.
// NOTE(review): this hunk shows removed and added lines together (two
// declarations of `cntr`/`scale`, an early `return` followed by a stray
// `#else`/`#endif`) - confirm against the committed file.
__inline__ uint64_t timer_mips()
{
// Access to these registers _might_ be prohibited to user-mode code,
// but there is no way to check it. Linux allows it in configure_hwrena():
// https://github.com/torvalds/linux/blob/v6.9/arch/mips/kernel/traps.c#L2190-L2194
uint32_t cntr, scale = 1;
uint32_t cntr, scale;
asm volatile(
"rdhwr %0, $2\n\t" // MIPS_HWR_CC
"rdhwr %1, $3\n\t" // MIPS_HWR_CCRES
: "=r" (cntr), "=r" (scale));
return uint64_t(cntr) * scale;
#else
return timeofday();
#endif
// Store the value minus one so that 1..16 fits into 4 bits; a zero read
// wraps around to UINT32_MAX and is rejected by the range check below.
scale--;
if (scale > 15)
return timer_inf;
// Unfortunately, 32-bit counter overflows in a few seconds, so wall clock timestamp
// has to be embedded into the timer value. Hopefully, clock_gettime call is VDSO...
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC_FASTEST, &ts);
// Keep only the low 28 bits of the seconds; the top 4 bits are taken by `scale`.
const uint32_t s28 = ts.tv_sec & (UINT32_MAX >> 4);
return uint64_t(scale) << 60 | (uint64_t(s28) << 32) | cntr;
}
#endif // __mips__

// see https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf 3.2.1 The Improved Benchmarking Method
__inline__ uint64_t timer_start()
Expand All @@ -219,6 +260,8 @@ __inline__ uint64_t timer_start()
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
"%rax", "%rbx", "%rcx", "%rdx");
return ((uint64_t)cycles_high << 32) | cycles_low;
#elif defined(__mips__)
return timer_mips();
#else
return rdtsc();
#endif
Expand Down Expand Up @@ -247,6 +290,8 @@ __inline__ uint64_t timer_end()
"cpuid\n\t": "=r" (cycles_high), "=r" (cycles_low)::
"%rax", "%rbx", "%rcx", "%rdx");
return ((uint64_t)cycles_high << 32) | cycles_low;
#elif defined(__mips__)
return timer_mips();
#else
return rdtsc();
#endif
Expand Down
16 changes: 7 additions & 9 deletions SpeedTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,10 @@ NEVER_INLINE int64_t timehash ( pfHash hash, const void * key, int len, int seed
uint32_t temp[16];

begin = timer_start();

hash(key,len,seed,temp);

end = timer_end();
return end - begin;

return timer_sub(end, begin);
}

//-----------------------------------------------------------------------------
Expand Down Expand Up @@ -213,7 +211,7 @@ NEVER_INLINE int64_t timehash_small ( pfHash hash, const void * key, int len, in
end = timer_end();
delete[] buf;

return (int64_t)((end - begin) / (double)NUM_TRIALS);
return timer_sub(end, begin) / NUM_TRIALS;
}

//-----------------------------------------------------------------------------
Expand Down Expand Up @@ -353,7 +351,7 @@ double HashMapSpeedTest ( pfHash pfhash, const int hashbits,
hashmap.erase(line);
}
end = timer_end();
t1 = (double)(end - begin) / (double)words.size();
t1 = double(timer_sub(end, begin)) / words.size();
}
fflush(NULL);
printf("%0.3f cycles/op (%zu inserts, 1%% deletions)\n",
Expand All @@ -378,7 +376,7 @@ double HashMapSpeedTest ( pfHash pfhash, const int hashbits,
found++;
}
end = timer_end();
t = (double)(end - begin) / (double)words.size();
t = double(timer_sub(end, begin)) / words.size();
if(found > 0 && t > 0) times.push_back(t);
}
hashmap.clear();
Expand Down Expand Up @@ -414,7 +412,7 @@ double HashMapSpeedTest ( pfHash pfhash, const int hashbits,
phashmap.erase(line);
}
end = timer_end();
t1 = (double)(end - begin) / (double)words.size();
t1 = double(timer_sub(end, begin)) / words.size();
}
fflush(NULL);
printf("%0.3f cycles/op (%zu inserts, 1%% deletions)\n",
Expand All @@ -438,7 +436,7 @@ double HashMapSpeedTest ( pfHash pfhash, const int hashbits,
found++;
}
end = timer_end();
t = (double)(end - begin) / (double)words.size();
t = double(timer_sub(end, begin)) / words.size();
if(found > 0 && t > 0) times.push_back(t);
}
phashmap.clear();
Expand Down
10 changes: 8 additions & 2 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1027,8 +1027,14 @@ void test ( hashfunc<hashtype> hash, HashInfo* info )
printf("[[[ Speed Tests ]]]\n\n");
if (timer_counts_ns())
printf("WARNING: no cycle counter, cycle == 1ns\n");
if (timer_start() == timer_end())
printf("WARNING: timer resolution is low\n");
{
const uint64_t begin = timer_start(), end = timer_end();
const uint64_t delta = timer_sub(end, begin);
if (delta > 64) // "good" is ~30..40 ticks
printf("WARNING: timer resolution is %llu (%#llx) ticks (%#llx - %#llx). Broken VDSO?\n",
(unsigned long long)delta, (unsigned long long)delta,
(unsigned long long)end, (unsigned long long)begin);
}
fflush(NULL);

Seed_init (info, info->verification);
Expand Down

0 comments on commit 3edb6bb

Please sign in to comment.