diff --git a/folly/hash/BUCK b/folly/hash/BUCK index b334e284edb..760aa20f589 100644 --- a/folly/hash/BUCK +++ b/folly/hash/BUCK @@ -10,6 +10,8 @@ cpp_library( "//folly:cpu_id", "//folly/detail:traponavx512", "//folly/external/fast-crc32:avx512_crc32c_v8s3x4", # @manual + "//folly/external/fast-crc32:neon_crc32c_v3s4x2e_v2", # @manual + "//folly/external/fast-crc32:neon_eor3_crc32c_v8s2x4_s3", # @manual "//folly/external/fast-crc32:sse_crc32c_v8s3x3", # @manual "//folly/hash/detail:checksum_detail", ], diff --git a/folly/hash/Checksum.cpp b/folly/hash/Checksum.cpp index d2aaa1e6caf..7fa8fa55cc2 100644 --- a/folly/hash/Checksum.cpp +++ b/folly/hash/Checksum.cpp @@ -24,6 +24,8 @@ #include #include #include // @manual +#include // @manual +#include // @manual #include // @manual #include @@ -90,6 +92,14 @@ bool crc32_hw_supported() { return id.sse42(); } +bool crc32c_hw_supported_neon() { + return false; +} + +bool crc32c_hw_supported_neon_eor3_sha3() { + return false; +} + #elif FOLLY_ARM_FEATURE_CRC32 // crc32_hw is defined in folly/external/nvidia/hash/Checksum.cpp @@ -106,6 +116,16 @@ bool crc32c_hw_supported_avx512() { return false; } +bool crc32c_hw_supported_neon() { + static bool has_neon = has_neon_crc32c_v3s4x2e_v2(); + return has_neon; +} + +bool crc32c_hw_supported_neon_eor3_sha3() { + static bool has_neon_eor3 = has_neon_eor3_crc32c_v8s2x4_s3(); + return has_neon_eor3; +} + bool crc32_hw_supported() { return true; } @@ -134,6 +154,14 @@ bool crc32c_hw_supported_avx512() { bool crc32_hw_supported() { return false; } + +bool crc32c_hw_supported_neon() { + return false; +} + +bool crc32c_hw_supported_neon_eor3_sha3() { + return false; +} #endif template @@ -179,6 +207,16 @@ uint32_t crc32c(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) { } #endif +#if FOLLY_AARCH64 + if (nbytes >= 2048 && detail::crc32c_hw_supported_neon_eor3_sha3()) { + return detail::neon_eor3_crc32c_v8s2x4_s3(data, nbytes, startingChecksum); + } + + if (nbytes >= 4096 && detail::crc32c_hw_supported_neon()) { + return detail::neon_crc32c_v3s4x2e_v2(data, nbytes, startingChecksum); + } +#endif + if (detail::crc32c_hw_supported()) { #if defined(FOLLY_ENABLE_SSE42_CRC32C_V8S3X3) if (nbytes > 4096) { diff --git a/folly/hash/detail/ChecksumDetail.h b/folly/hash/detail/ChecksumDetail.h index b0d525ad8d9..66bf258a22f 100644 --- a/folly/hash/detail/ChecksumDetail.h +++ b/folly/hash/detail/ChecksumDetail.h @@ -60,6 +60,18 @@ bool crc32c_hw_supported(); */ bool crc32c_hw_supported_avx512(); +/** + * Check whether a NEON hardware-accelerated CRC-32C implementation is + * supported on the current CPU. + */ +bool crc32c_hw_supported_neon(); + +/** + * Check whether a NEON+EOR3+SHA3 hardware-accelerated CRC-32C implementation + * is supported on the current CPU. + */ +bool crc32c_hw_supported_neon_eor3_sha3(); + /** * Compute a CRC-32C checksum of a buffer using a portable, * software-only implementation. diff --git a/folly/hash/test/BUCK b/folly/hash/test/BUCK index c2509742e2c..d5102de9db7 100644 --- a/folly/hash/test/BUCK +++ b/folly/hash/test/BUCK @@ -9,8 +9,11 @@ cpp_unittest( headers = [], deps = [ "//folly:benchmark", + "//folly:portability", "//folly:random", "//folly/external/fast-crc32:avx512_crc32c_v8s3x4", + "//folly/external/fast-crc32:neon_crc32c_v3s4x2e_v2", + "//folly/external/fast-crc32:neon_eor3_crc32c_v8s2x4_s3", "//folly/external/fast-crc32:sse_crc32c_v8s3x3", "//folly/hash:checksum", "//folly/hash:hash", diff --git a/folly/hash/test/ChecksumTest.cpp b/folly/hash/test/ChecksumTest.cpp index 4e7253dff6d..65c013fed97 100644 --- a/folly/hash/test/ChecksumTest.cpp +++ b/folly/hash/test/ChecksumTest.cpp @@ -19,8 +19,11 @@ #include #include +#include #include #include +#include +#include #include #include #include @@ -119,8 +122,10 @@ TEST(Checksum, crc32cHardware) { if (folly::detail::crc32c_hw_supported()) { testCRC32C(folly::detail::crc32c_hw); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -132,8 +137,10 @@ TEST(Checksum, crc32cHardwareEq) { EXPECT_EQ(sw, hw); } } else { +#if FOLLY_X64 LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -141,8 +148,10 @@ TEST(Checksum, crc32cContinuationHardware) { if (folly::detail::crc32c_hw_supported()) { testCRC32CContinuation(folly::detail::crc32c_hw); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -150,8 +159,10 @@ TEST(Checksum, crc32cHardwareSse42) { if (folly::detail::crc32c_hw_supported_sse42()) { testCRC32C(folly::detail::sse_crc32c_v8s3x3); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -163,8 +174,10 @@ TEST(Checksum, crc32cHardwareEqSse42) { ASSERT_EQ(sw, hw); } } else { +#if FOLLY_X64 LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -172,8 +185,10 @@ TEST(Checksum, crc32cContinuationHardwareSse42) { if (folly::detail::crc32c_hw_supported_sse42()) { testCRC32CContinuation(folly::detail::sse_crc32c_v8s3x3); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -181,8 +196,10 @@ TEST(Checksum, crc32cHardwareAvx512) { if (folly::detail::crc32c_hw_supported_avx512()) { testCRC32C(folly::detail::avx512_crc32c_v8s3x4); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping AVX512 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -194,8 +211,10 @@ TEST(Checksum, crc32cHardwareEqAvx512) { ASSERT_EQ(sw, hw); } } else { +#if FOLLY_X64 LOG(WARNING) << "skipping AVX512 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif } } @@ -203,8 +222,84 @@ TEST(Checksum, crc32cContinuationHardwareAvx512) { if (folly::detail::crc32c_hw_supported_avx512()) { testCRC32CContinuation(folly::detail::avx512_crc32c_v8s3x4); } else { +#if FOLLY_X64 LOG(WARNING) << "skipping AVX512 hardware-accelerated CRC-32C tests" << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cHardwareNeon) { + if (folly::detail::crc32c_hw_supported_neon()) { + testCRC32C(folly::detail::neon_crc32c_v3s4x2e_v2); + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cHardwareEqNeon) { + if (folly::detail::crc32c_hw_supported_neon()) { + for (size_t i = 0; i < 1000; i++) { + auto sw = folly::detail::crc32c_sw(buffer, i, 0); + auto hw = folly::detail::neon_crc32c_v3s4x2e_v2(buffer, i, 0); + ASSERT_EQ(sw, hw); + } + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cContinuationHardwareNeon) { + if (folly::detail::crc32c_hw_supported_neon()) { + testCRC32CContinuation(folly::detail::neon_crc32c_v3s4x2e_v2); + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cHardwareNeonEor3Sha3) { + if (folly::detail::crc32c_hw_supported_neon_eor3_sha3()) { + testCRC32C(folly::detail::neon_eor3_crc32c_v8s2x4_s3); + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON+EOR3+SHA3 hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cHardwareEqNeonEor3Sha3) { + if (folly::detail::crc32c_hw_supported_neon_eor3_sha3()) { + for (size_t i = 0; i < 1000; i++) { + auto sw = folly::detail::crc32c_sw(buffer, i, 0); + auto hw = folly::detail::neon_eor3_crc32c_v8s2x4_s3(buffer, i, 0); + ASSERT_EQ(sw, hw); + } + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON+EOR3+SHA3 hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif + } +} + +TEST(Checksum, crc32cContinuationHardwareNeonEor3Sha3) { + if (folly::detail::crc32c_hw_supported_neon_eor3_sha3()) { + testCRC32CContinuation(folly::detail::neon_eor3_crc32c_v8s2x4_s3); + } else { +#if FOLLY_AARCH64 + LOG(WARNING) << "skipping NEON+EOR3+SHA3 hardware-accelerated CRC-32C tests" + << " (not supported on this CPU)"; +#endif } } @@ -230,6 +325,15 @@ TEST(Checksum, crc32clargeBuffers) { auto crcAvx = folly::detail::avx512_crc32c_v8s3x4(bufp, kLargeBufSz, ~0); ASSERT_EQ(kCrc, crcAvx); } + if (folly::detail::crc32c_hw_supported_neon()) { + auto crcHw = folly::detail::neon_crc32c_v3s4x2e_v2(bufp, kLargeBufSz, ~0); + ASSERT_EQ(kCrc, crcHw); + } + if (folly::detail::crc32c_hw_supported_neon_eor3_sha3()) { + auto crcHw = + folly::detail::neon_eor3_crc32c_v8s2x4_s3(bufp, kLargeBufSz, ~0); + ASSERT_EQ(kCrc, crcHw); + } } #endif