Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/add arm support #286

Closed
wants to merge 53 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
2d89df4
move x86 arch and SIMD types to x86 arch folder
markos Sep 17, 2020
6a40793
move cpuid stuff to util/arch/x86
markos Sep 17, 2020
ea721c9
move crc32 SSE42 implementation to util/arch/x86
markos Sep 18, 2020
956b001
move masked_move* AVX2 implementation to util/arch/x86
markos Sep 18, 2020
8ed5f4a
fix include paths for masked_move
markos Sep 18, 2020
aac1f0f
move x86 bitutils.h implementations to util/arch/x86/bitutils.h
markos Sep 22, 2020
6581aae
move x86 popcount.h implementations to util/arch/x86/popcount.h
markos Sep 22, 2020
9f3ad89
move andn helper function to bitutils.h
markos Sep 22, 2020
e915d84
no need to check for WIN32*
markos Sep 22, 2020
e8e188a
move x86 implementations of simd_utils.h to util/arch/x86/
markos Sep 22, 2020
f7a6b89
add some set*() functions, harmonize names, rename setAxB to set1_AxB…
markos Sep 23, 2020
5333467
fix names, use own intrinsic instead of explicit _mm* ones
markos Sep 23, 2020
04fbf24
Revert "move x86 popcount.h implementations to util/arch/x86/popcount.h"
markos Sep 23, 2020
f0e70bc
Revert "Revert "move x86 popcount.h implementations to util/arch/x86/…
markos Sep 24, 2020
b1170bc
add arm checks in platform.cmake
markos Oct 6, 2020
5952c64
add necessary modifications to CMake system to enable building on ARM…
markos Oct 6, 2020
e91082d
use right intrinsic
markos Oct 6, 2020
9a04942
minor fix
markos Oct 7, 2020
4c924cc
add arm architecture basic defines
markos Oct 7, 2020
5d773dd
use C implementation of popcount for arm
markos Oct 7, 2020
d2cf1a7
move cpuid_flags.h header to common
markos Oct 8, 2020
1c2c73b
add C implementation of pdep64()
markos Oct 8, 2020
a921217
add arm bitutils.h header
markos Oct 8, 2020
31ac671
add ARM version of simd_utils.h
markos Oct 13, 2020
5b425bd
add arm simple cpuid_flags
markos Oct 15, 2020
c5a7f4b
add ARM simd_utils vectorized functions for 128-bit vectors
markos Oct 15, 2020
45bfed9
add scalar versions of the vectorized functions for architectures tha…
markos Oct 15, 2020
e7e1308
fix compilation paths for cpuid_flags for x86
markos Oct 16, 2020
83977db
split arch-agnostic simd_utils.h functions into the common file
markos Oct 16, 2020
4bce012
Revert "move x86 popcount.h implementations to util/arch/x86/popcount.h"
markos Oct 16, 2020
c4db636
scalar implementations of diffrich256 and diffrich384
markos Oct 16, 2020
149ea93
don't redefine function on x86
markos Oct 16, 2020
0bef151
don't use SSE directly in the tests
markos Oct 30, 2020
5482429
fix ARM implementations
markos Oct 30, 2020
547f79b
small optimization in storecompress*()
markos Oct 30, 2020
592b190
needed for ARM vector type conversions
markos Oct 30, 2020
18296ee
fix 32-bit/64-bit detection
markos Nov 5, 2020
7b8cf97
add extra instructions (currently arm-only), fix order of elements in…
markos Nov 5, 2020
3390418
add compress128 function and implementation
markos Nov 5, 2020
501f60e
add some debug info
markos Nov 5, 2020
62fed20
add some debug and minor optimizations in unit test
markos Nov 5, 2020
c4f1372
remove debug from functions
markos Nov 5, 2020
606c53a
fix compiler flag testcase
markos Nov 24, 2020
1c26f04
when building in debug mode, vgetq_lane_*() and vextq_*() need immedi…
markos Nov 24, 2020
d763652
helper functions to print a m128 vector in debug mode
markos Nov 24, 2020
17ab42d
small optimization that was for some reason failing in ARM, should be…
markos Nov 24, 2020
259c257
define debug vector print functions to NULL in non-debug mode
markos Dec 3, 2020
38477b0
fix movq and load_m128_from_u64a and resp. test for NEON
markos Dec 3, 2020
c38722a
add ARM platform
markos Dec 3, 2020
39945b7
clear zones array
markos Dec 3, 2020
773dc6f
optimize *shiftbyte_m128() functions to use palign instead of variabl…
markos Dec 7, 2020
e088c6a
remove forgotten printf
markos Dec 7, 2020
61b963a
fix x86 compilation
markos Dec 8, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 26 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ else()
string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
endforeach ()

if (CMAKE_COMPILER_IS_GNUCC)
if (ARCH_IA32 OR ARCH_X86_64 AND CMAKE_COMPILER_IS_GNUCC)
message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
# If gcc doesn't recognise the host cpu, then mtune=native becomes
# generic, which isn't very good in some cases. march=native looks at
Expand Down Expand Up @@ -281,10 +281,16 @@ else()
endif()

CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
if (ARCH_IA32 OR ARCH_X86_64)
CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
elseif (ARCH_ARM32 OR ARCH_AARCH64)
CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions")
endif()

CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
Expand Down Expand Up @@ -564,11 +570,22 @@ install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs")
set (hs_exec_common_SRCS
src/alloc.c
src/scratch.c
src/util/cpuid_flags.c
src/util/cpuid_flags.h
src/util/arch/common/cpuid_flags.h
src/util/multibit.c
)

if (ARCH_IA32 OR ARCH_X86_64)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/x86/cpuid_flags.c
)
else (ARCH_ARM32 OR ARCH_AARCH64)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/arm/cpuid_flags.c
)
endif ()

set (hs_exec_SRCS
${hs_HEADERS}
src/hs_version.h
Expand Down Expand Up @@ -694,7 +711,6 @@ set (hs_exec_SRCS
src/util/exhaust.h
src/util/fatbit.h
src/util/join.h
src/util/masked_move.h
src/util/multibit.h
src/util/multibit.c
src/util/multibit_compress.h
Expand All @@ -716,7 +732,8 @@ set (hs_exec_SRCS

set (hs_exec_avx2_SRCS
src/fdr/teddy_avx2.c
src/util/masked_move.c
src/util/arch/x86/masked_move.c
src/util/arch/x86/masked_move.h
)


Expand Down
47 changes: 32 additions & 15 deletions cmake/arch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ if (HAVE_C_X86INTRIN_H)
set (INTRIN_INC_H "x86intrin.h")
elseif (HAVE_C_INTRIN_H)
set (INTRIN_INC_H "intrin.h")
else ()
elseif (HAVE_C_ARM_NEON_H)
set (INTRIN_INC_H "arm_neon.h")
set (FAT_RUNTIME OFF)
else()
message (FATAL_ERROR "No intrinsics header found")
endif ()

Expand All @@ -29,15 +32,16 @@ else (NOT FAT_RUNTIME)
set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}")
endif ()

# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
if (ARCH_IA32 OR ARCH_X86_64)
# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
__m128i a = _mm_set1_epi8(1);
(void)_mm_shuffle_epi8(a, a);
}" HAVE_SSSE3)

# now look for AVX2
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
# now look for AVX2
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX2__)
#error no avx2
#endif
Expand All @@ -47,8 +51,8 @@ int main(){
(void)_mm256_xor_si256(z, z);
}" HAVE_AVX2)

# and now for AVX512
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
# and now for AVX512
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX512BW__)
#error no avx512bw
#endif
Expand All @@ -58,8 +62,8 @@ int main(){
(void)_mm512_abs_epi8(z);
}" HAVE_AVX512)

# and now for AVX512VBMI
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
# and now for AVX512VBMI
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
#if !defined(__AVX512VBMI__)
#error no avx512vbmi
#endif
Expand All @@ -70,26 +74,39 @@ int main(){
(void)_mm512_permutexvar_epi8(idx, a);
}" HAVE_AVX512VBMI)

elseif (ARCH_ARM32 OR ARCH_AARCH64)
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
int main() {
int32x4_t a = vdupq_n_s32(1);
(void)a;
}" HAVE_NEON)
else ()
message (FATAL_ERROR "Unsupported architecture")
endif ()

if (FAT_RUNTIME)
if (NOT HAVE_SSSE3)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
message(FATAL_ERROR "SSSE3 support required to build fat runtime")
endif ()
if (NOT HAVE_AVX2)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2)
message(FATAL_ERROR "AVX2 support required to build fat runtime")
endif ()
if (BUILD_AVX512 AND NOT HAVE_AVX512)
if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX512 AND NOT HAVE_AVX512)
message(FATAL_ERROR "AVX512 support requested but not supported")
endif ()
else (NOT FAT_RUNTIME)
if (NOT HAVE_AVX2)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2)
message(STATUS "Building without AVX2 support")
endif ()
if (NOT HAVE_AVX512)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512)
message(STATUS "Building without AVX512 support")
endif ()
if (NOT HAVE_SSSE3)
if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
endif ()
if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON)
message(FATAL_ERROR "NEON support required for ARM support")
endif ()
endif ()

unset (CMAKE_REQUIRED_FLAGS)
Expand Down
9 changes: 9 additions & 0 deletions cmake/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
/* "Define if building for EM64T" */
#cmakedefine ARCH_X86_64

/* "Define if building for ARM32" */
#cmakedefine ARCH_ARM32

/* "Define if building for AARCH64" */
#cmakedefine ARCH_AARCH64

/* internal build, switch on dump support. */
#cmakedefine DUMP_SUPPORT

Expand Down Expand Up @@ -45,6 +51,9 @@
/* C compiler has intrin.h */
#cmakedefine HAVE_C_INTRIN_H

/* C compiler has arm_neon.h */
#cmakedefine HAVE_C_ARM_NEON_H

/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
0 if you don't. */
#cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
Expand Down
14 changes: 10 additions & 4 deletions cmake/platform.cmake
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
# determine the target arch

# really only interested in the preprocessor here
CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT)
CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64)

CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)
CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)

set(ARCH_X86_64 ${ARCH_64_BIT})
set(ARCH_IA32 ${ARCH_32_BIT})
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)

if (ARCH_X86_64 OR ARCH_AARCH64)
set(ARCH_64_BIT TRUE)
else()
set(ARCH_32_BIT TRUE)
endif()
49 changes: 1 addition & 48 deletions src/crc32.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
#include "config.h"
#include "ue2common.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#if !defined(HAVE_SSE42)

Expand Down Expand Up @@ -579,53 +578,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,
}

#else // HAVE_SSE42

#ifdef ARCH_64_BIT
#define CRC_WORD 8
#define CRC_TYPE u64a
#define CRC_FUNC _mm_crc32_u64
#else
#define CRC_WORD 4
#define CRC_TYPE u32
#define CRC_FUNC _mm_crc32_u32
#endif

/*
* Use the crc32 instruction from SSE4.2 to compute our checksum - same
* polynomial as the above function.
*/
static really_inline
u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf,
const size_t length) {
u32 crc = running_crc;

// Process byte-by-byte until p_buf is aligned

const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD);
size_t init_bytes = aligned_buf - p_buf;
size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD;
size_t end_bytes = length - init_bytes - running_length;

while (p_buf < aligned_buf) {
crc = _mm_crc32_u8(crc, *p_buf++);
}

// Main aligned loop, processes a word at a time.

for (size_t li = 0; li < running_length/CRC_WORD; li++) {
CRC_TYPE block = *(const CRC_TYPE *)p_buf;
crc = CRC_FUNC(crc, block);
p_buf += CRC_WORD;
}

// Remaining bytes

for(size_t li = 0; li < end_bytes; li++) {
crc = _mm_crc32_u8(crc, *p_buf++);
}

return crc;
}
#include "util/arch/x86/crc32.h"
#endif

#ifdef VERIFY_ASSERTION
Expand Down
1 change: 1 addition & 0 deletions src/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ extern "C"
// CPU type is the low 6 bits (we can't need more than 64, surely!)

#define HS_PLATFORM_INTEL 1
#define HS_PLATFORM_ARM 2
#define HS_PLATFORM_CPU_MASK 0x3F

#define HS_PLATFORM_NOAVX2 (4<<13)
Expand Down
4 changes: 3 additions & 1 deletion src/dispatcher.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@
#include "hs_common.h"
#include "hs_runtime.h"
#include "ue2common.h"
#include "util/cpuid_inline.h"
#if defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#endif
#include "util/join.h"

#if defined(DISABLE_AVX512_DISPATCH)
Expand Down
16 changes: 2 additions & 14 deletions src/fdr/fdr.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "teddy.h"
#include "teddy_internal.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"

Expand Down Expand Up @@ -119,20 +120,6 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
};

/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
* so we force its generation.
*/
static really_inline
u64a andn(const u32 a, const u8 *b) {
u64a r;
#if defined(HAVE_BMI) && !defined(NO_ASM)
__asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
#else
r = unaligned_load_u32(b) & ~a;
#endif
return r;
}

/* generates an initial state mask based on the last byte-ish of history rather
* than being all accepting. If there is no history to consider, the state is
* generated based on the minimum length of each bucket in order to prevent
Expand Down Expand Up @@ -739,6 +726,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
assert(ISALIGNED_CL(confBase));
struct zone zones[ZONE_MAX];
assert(fdr->domain > 8 && fdr->domain < 16);
memset(zones, 0, sizeof(zones));

size_t numZone = prepareZones(a->buf, a->len,
a->buf_history + a->len_history,
Expand Down
Loading