diff --git a/CMakeLists.txt b/CMakeLists.txt index 20b137765..8bbae670d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,16 @@ if( BUILTIN_POPCNT ) endif() endif() +include(CheckAVX2) +if( AVX2 ) + if( CMAKE_COMPILER_IS_GNUCXX ) + append_cxx_compiler_flags("-mavx2" "GCC" CMAKE_CXX_OPT_FLAGS) + elseif( CMAKE_COMPILER_IS_CLANGXX ) + append_cxx_compiler_flags("-mavx2" "CLANG" CMAKE_CXX_OPT_FLAGS) + endif() + message(STATUS "Your compiler is not supported yet!") +endif() + add_subdirectory(external) add_subdirectory(include) add_subdirectory(lib) diff --git a/CMakeModules/CheckAVX2.cmake b/CMakeModules/CheckAVX2.cmake new file mode 100644 index 000000000..dcd8815f8 --- /dev/null +++ b/CMakeModules/CheckAVX2.cmake @@ -0,0 +1,24 @@ +# Check if the CPU provides fast operations +# for popcount, leftmost and rightmost bit + +set(AVX2 0) +# Check if we are on a Linux system +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + # Use /proc/cpuinfo to get the information + file(STRINGS "/proc/cpuinfo" _cpuinfo) + if(_cpuinfo MATCHES "(avx2)") + set(AVX2 1) + endif() +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") +# handle windows +# get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) +# get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) +elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") +# handle MacOs +execute_process(COMMAND sysctl -n machdep.cpu.features + OUTPUT_VARIABLE _cpuinfo OUTPUT_STRIP_TRAILING_WHITESPACE) + if(_cpuinfo MATCHES "AVX2") + set(AVX2 1) + endif() +endif() + diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp index b13fe791f..4f7436020 100644 --- a/include/sdsl/bits.hpp +++ b/include/sdsl/bits.hpp @@ -24,6 +24,9 @@ #include // for uint64_t uint32_t declaration #include // for cerr #include +#include // SSE/AVX +#include "ymm_union.hpp" // convenient YMM register wrapper +#include "xmm_union.hpp" // convenient XMM register wrapper #ifdef __SSE4_2__ #include #endif @@ -102,6 +105,22 @@ struct bits { */ static uint64_t cnt(uint64_t x); + //! Counts the number of set bits in YMM register x. + /*! \param YMM register + \return Number of set bits. + */ +#ifdef __AVX2__ + static uint64_t cnt256(__m256i x); +#endif + + //! Counts the number of set bits in XMM register x. + /*! \param XMM register + \return Number of set bits. + */ +#ifdef __SSE4_2__ + static uint64_t cnt128(__m128i x); +#endif + //! Position of the most significant set bit the 64-bit word x /*! \param x 64-bit word \return The position (in 0..63) of the least significant set bit @@ -237,6 +256,54 @@ struct bits { // ============= inline - implementations ================ +#ifdef __AVX2__ +inline uint64_t bits::cnt256(__m256i x){ + + // 4-bit universal table, 4-bit mask + static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F); + static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4); + + __m256i low, high, bwcount; + + // byte halves stored in separate YMM registers + low = _mm256_and_si256(MASK4_256, x); + high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4)); + + // bytewise population count + bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low), + _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high)); + + // Use union to access individual bytes (unsigned integers) + sdsl::YMM_union ymm_union; + ymm_union.ymm = _mm256_sad_epu8(bwcount, _mm256_setzero_si256()); + return ymm_union.values[0] + ymm_union.values[4] + ymm_union.values[8] + ymm_union.values[12]; +} +#endif + +#ifdef __SSE4_2__ +inline uint64_t bits::cnt128(__m128i x){ + + // 4-bit universal table, 4-bit mask + static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); + static const __m128i MASK4 = _mm_set1_epi8(0x0F); + + __m128i low, high, count; + + low = _mm_and_si128(MASK4, x); + high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4)); + count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low), + _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high)); + + // Use union to access individual bytes (unsigned integers) + sdsl::XMM_union xmm_union; + xmm_union.xmm = _mm_sad_epu8(count, _mm_setzero_si128()); + return xmm_union.values[0] + xmm_union.values[4]; +} +#endif + // see page 11, Knuth TAOCP Vol 4 F1A inline uint64_t bits::cnt(uint64_t x) { diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp index b93a42e31..c522da121 100644 --- a/include/sdsl/uint256_t.hpp +++ b/include/sdsl/uint256_t.hpp @@ -62,8 +62,30 @@ class uint256_t } inline uint16_t popcount() { +#ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle + sdsl::YMM_union ymm_union; + ymm_union.values[0] = m_lo; + ymm_union.values[1] = m_mid; + ymm_union.values[2] = m_high >> 64; + ymm_union.values[3] = m_high; + return bits::cnt256(ymm_union.ymm); +#endif + +#ifdef __SSE4_2__ // 16 table lookups per clock cycle + sdsl::XMM_union xmm_union1; + sdsl::XMM_union xmm_union2; + xmm_union1.values[0] = m_lo; + xmm_union1.values[1] = m_mid; + xmm_union2.values[0] = m_high >> 64; + xmm_union2.values[1] = m_high; + + return bits::cnt128(xmm_union1.xmm) + bits::cnt128(xmm_union2.xmm); + + +#else // byte after byte return ((uint16_t)bits::cnt(m_lo)) + bits::cnt(m_mid) + bits::cnt(m_high>>64) + bits::cnt(m_high); +#endif } inline uint16_t hi() { diff --git a/include/sdsl/xmm_union.hpp b/include/sdsl/xmm_union.hpp new file mode 100644 index 000000000..a28f4402c --- /dev/null +++ b/include/sdsl/xmm_union.hpp @@ -0,0 +1,37 @@ +/* sdsl - succinct data structures library + Copyright (C) 2012 Simon Gog + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/ . +*/ +/*! \file xmm_union.hpp + \brief xmm_union.hpp contains a convenientunion for XMM registers (128-bits). + \author Diego Havenstein +*/ +#ifndef INCLUDED_SDSL_XMMUNION +#define INCLUDED_SDSL_XMMUNION + +namespace sdsl +{ + +#ifdef __SSE4_2__ +template +union XMM_union { + __m128i xmm; + T values[16/sizeof(T)]; +}; +#endif + +} // end namespace + +#endif diff --git a/include/sdsl/ymm_union.hpp b/include/sdsl/ymm_union.hpp new file mode 100644 index 000000000..809b050b3 --- /dev/null +++ b/include/sdsl/ymm_union.hpp @@ -0,0 +1,37 @@ +/* sdsl - succinct data structures library + Copyright (C) 2012 Simon Gog + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/ . +*/ +/*! \file ymm_union.hpp + \brief ymm_union.hpp contains a convenientunion for YMM registers (256-bits). + \author Diego Havenstein +*/ +#ifndef INCLUDED_SDSL_YMMUNION +#define INCLUDED_SDSL_YMMUNION + +namespace sdsl +{ + +#ifdef __AVX2__ +template +union YMM_union { + __m256i ymm; + T values[32/sizeof(T)]; +}; +#endif + +} // end namespace + +#endif