Update to latest libprimesieve
kimwalisch committed Jul 11, 2024
1 parent dcdb6f0 commit 9100e57
Showing 1 changed file with 138 additions and 36 deletions.
174 changes: 138 additions & 36 deletions lib/primesieve/include/primesieve/popcnt.hpp
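The diff below moves the portable bit-twiddling popcount into each compiler branch as a NOINLINE fallback and pairs it with a fast path that uses the hardware POPCNT instruction after a one-time runtime CPU check. As a rough standalone sketch of that dispatch pattern for GCC/Clang on x86-64 (not the library's exact code: __builtin_cpu_supports stands in here for primesieve's cached cpu_supports_popcnt flag):

#include <stdint.h>

namespace {

// Portable Hamming-weight popcount, same algorithm as in the diff.
// Marked noinline so the fast path below stays small.
__attribute__((noinline))
uint64_t popcnt64_bitwise(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}

inline uint64_t popcnt64(uint64_t x)
{
  // Runtime feature check; primesieve caches the CPUID result once
  // instead of querying it on every call.
  if (__builtin_cpu_supports("popcnt"))
  {
    // POPCNT instruction via inline asm, as in the x86-64 branch.
    __asm__("popcnt %1, %0" : "=r"(x) : "r"(x));
    return x;
  }
  return popcnt64_bitwise(x);
}

} // namespace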
@@ -19,14 +19,23 @@
#include "cpu_supports_popcnt.hpp"
#endif

// GCC & Clang
#if defined(__GNUC__) || \
    __has_builtin(__builtin_popcountl)

// CPUID is only enabled on x86 and x86-64 CPUs
// if the user compiles without -mpopcnt.
#if defined(ENABLE_MULTIARCH_x86_POPCNT)
#if defined(__x86_64__)

namespace {

/// This uses fewer arithmetic operations than any other known
/// implementation on machines with fast multiplication.
/// It uses 12 arithmetic operations, one of which is a multiply.
/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
inline uint64_t popcnt64_bitwise(uint64_t x)
NOINLINE uint64_t popcnt64_bitwise(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
@@ -40,19 +49,6 @@ inline uint64_t popcnt64_bitwise(uint64_t x)
  return (x * h01) >> 56;
}

} // namespace

// GCC & Clang
#if defined(__GNUC__) || \
    __has_builtin(__builtin_popcountl)

// CPUID is only enabled on x86 and x86-64 CPUs
// if the user compiles without -mpopcnt.
#if defined(ENABLE_MULTIARCH_x86_POPCNT)
#if defined(__x86_64__)

namespace {

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  // On my AMD EPYC 7642 CPU using GCC 12 this runtime
@@ -63,13 +59,7 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
    return x;
  }
  else
  {
    // On x86 and x64 CPUs when using the GCC compiler
    // __builtin_popcount*(x) is slow (not inlined function call)
    // when compiling without -mpopcnt. Therefore we avoid
    // using __builtin_popcount*(x) here.
    return popcnt64_bitwise(x);
  }
}

} // namespace
@@ -78,6 +68,25 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)

namespace {

/// This uses fewer arithmetic operations than any other known
/// implementation on machines with fast multiplication.
/// It uses 12 arithmetic operations, one of which is a multiply.
/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
NOINLINE uint64_t popcnt64_bitwise(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  if_likely(cpu_supports_popcnt)
@@ -89,13 +98,7 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
    return x0 + x1;
  }
  else
  {
    // On x86 and x64 CPUs when using the GCC compiler
    // __builtin_popcount*(x) is slow (not inlined function call)
    // when compiling without -mpopcnt. Therefore we avoid
    // using __builtin_popcount*(x) here.
    return popcnt64_bitwise(x);
  }
}

} // namespace
@@ -132,23 +135,66 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)

namespace {

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
#if defined(__POPCNT__) || \
    defined(__AVX__)

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  return __popcnt64(x);
}

#elif defined(ENABLE_MULTIARCH_x86_POPCNT)

/// This uses fewer arithmetic operations than any other known
/// implementation on machines with fast multiplication.
/// It uses 12 arithmetic operations, one of which is a multiply.
/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
NOINLINE uint64_t popcnt64_bitwise(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  if_likely(cpu_supports_popcnt)
    return __popcnt64(x);
  else
    return popcnt64_bitwise(x);
}

#else
  return popcnt64_bitwise(x);
#endif

/// This uses fewer arithmetic operations than any other known
/// implementation on machines with fast multiplication.
/// It uses 12 arithmetic operations, one of which is a multiply.
/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}

#endif

} // namespace

#elif defined(_MSC_VER) && \
@@ -159,25 +205,68 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)

namespace {

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
#if defined(__POPCNT__) || \
    defined(__AVX__)

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  return __popcnt(uint32_t(x)) +
         __popcnt(uint32_t(x >> 32));
}

#elif defined(ENABLE_MULTIARCH_x86_POPCNT)

/// This uses fewer arithmetic operations than any other known
/// implementation on machines with fast multiplication.
/// It uses 12 arithmetic operations, one of which is a multiply.
/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
NOINLINE uint64_t popcnt64_bitwise(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  if_likely(cpu_supports_popcnt)
    return __popcnt(uint32_t(x)) +
           __popcnt(uint32_t(x >> 32));
  else
    return popcnt64_bitwise(x);
}

#else
  return popcnt64_bitwise(x);
#endif

/// This uses fewer arithmetic operations than any other known
/// implementation on machines with fast multiplication.
/// It uses 12 arithmetic operations, one of which is a multiply.
/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}

#endif

} // namespace

#elif __cplusplus >= 202002L && \
@@ -201,10 +290,23 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)

namespace {

/// Portable (but slow) popcount algorithm
/// This uses fewer arithmetic operations than any other known
/// implementation on machines with fast multiplication.
/// It uses 12 arithmetic operations, one of which is a multiply.
/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  return popcnt64_bitwise(x);
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}

} // namespace
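As a quick usage check of the bit-twiddling routine (not part of the commit): the three masking steps leave each byte of x holding its own bit count, and the multiply by 0x0101010101010101 sums those bytes into the top byte, which the final shift by 56 extracts. A small GCC/Clang test program along those lines:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

// Same Hamming-weight routine as in popcnt.hpp.
static uint64_t popcnt64_bitwise(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}

int main()
{
  uint64_t samples[] = { 0, 1, 0xFF, 0x8000000000000000ull,
                         0x123456789ABCDEF0ull, ~0ull };

  // The bitwise version must agree with the compiler builtin.
  for (uint64_t x : samples)
    assert(popcnt64_bitwise(x) == (uint64_t) __builtin_popcountll(x));

  // Each hex digit 0..F appears exactly once, so this prints 32.
  printf("%llu\n", (unsigned long long) popcnt64_bitwise(0x123456789ABCDEF0ull));
  return 0;
}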
