diff --git a/misc.cpp b/misc.cpp index 26bfbc6ee..5738b5846 100644 --- a/misc.cpp +++ b/misc.cpp @@ -19,6 +19,14 @@ #include "integer.h" #include "secblock.h" +#if defined(__AVX__) || defined(__SSE2__) +# include +#endif + +#if (CRYPTOPP_ARM_NEON_HEADER) +# include +#endif + NAMESPACE_BEGIN(CryptoPP) byte* BytePtr(SecByteBlock& str) @@ -43,111 +51,162 @@ size_t BytePtrSize(const SecByteBlock& str) return str.size(); } +// xorbuf simplified at https://github.com/weidai11/cryptopp/issues/1020 void xorbuf(byte *buf, const byte *mask, size_t count) { CRYPTOPP_ASSERT(buf != NULLPTR); CRYPTOPP_ASSERT(mask != NULLPTR); CRYPTOPP_ASSERT(count > 0); - size_t i=0; - if (IsAligned(buf) && IsAligned(mask)) +#if defined(__AVX__) + while (count >= 32) + { + __m256i b = _mm256_loadu_si256(reinterpret_cast(buf)); + __m256i m = _mm256_loadu_si256(reinterpret_cast(mask)); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(buf), _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(m)))); + buf += 32; mask += 32; count -= 32; + } + // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties + _mm256_zeroupper(); +#endif +#if defined(__SSE2__) + while (count >= 16) + { + __m128i b = _mm_loadu_si128(reinterpret_cast(buf)); + __m128i m = _mm_loadu_si128(reinterpret_cast(mask)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(buf), _mm_castps_si128( + _mm_xor_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(m)))); + buf += 16; mask += 16; count -= 16; + } + + if (count == 0) return; +#endif +#if defined(__ARM_FEATURE_NEON) + while (count >= 16) + { + vst1q_u8(buf, veorq_u8(vld1q_u8(buf), vld1q_u8(mask))); + buf += 16; mask += 16; count -= 16; + } + + if (count == 0) return; +#endif + + while (count >= 4) { - if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned(buf) && IsAligned(mask)) - { - for (i=0; i 0); - size_t i=0; - if (IsAligned(output) && IsAligned(input) && IsAligned(mask)) +#if defined(__AVX__) + while (count >= 32) + { + __m256i b = _mm256_loadu_si256(reinterpret_cast(input)); + __m256i m = _mm256_loadu_si256(reinterpret_cast(mask)); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(m)))); + output += 32; input += 32; mask += 32; count -= 32; + } + // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties + _mm256_zeroupper(); +#endif +#if defined(__SSE2__) + while (count >= 16) + { + __m128i b = _mm_loadu_si128(reinterpret_cast(input)); + __m128i m = _mm_loadu_si128(reinterpret_cast(mask)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm_castps_si128( + _mm_xor_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(m)))); + output += 16; input += 16; mask += 16; count -= 16; + } + + if (count == 0) return; +#endif +#if defined(__ARM_FEATURE_NEON) + while (count >= 16) + { + vst1q_u8(output, veorq_u8(vld1q_u8(input), vld1q_u8(mask))); + output += 16; input += 16; mask += 16; count -= 16; + } + + if (count == 0) return; +#endif + + while (count >= 4) { - if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned(output) && IsAligned(input) && IsAligned(mask)) - { - for (i=0; i 0); - size_t i=0; - byte acc8 = 0; +#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_ARM64 || CRYPTOPP_BOOL_PPC64 || CRYPTOPP_BOOL_MIPS64 || CRYPTOPP_BOOL_SPARC64 + word64 acc64 = 0; + while (count >= 8) + { + word64 b, m; + memcpy(&b, buf, 8); memcpy(&m, mask, 8); + acc64 |= b ^ m; + + buf += 8; mask += 8; count -= 8; + } - if (IsAligned(buf) && IsAligned(mask) && count) + word32 acc8 = (acc64 >> 32) | (acc64 & 0xffffffff); + acc8 = static_cast(acc8) | static_cast(acc8 >> 8) | + static_cast(acc8 >> 16) | static_cast(acc8 >> 24); +#else + word32 acc32 = 0; + while (count >= 4) { - word32 acc32 = 0; - if (!CRYPTOPP_BOOL_SLOW_WORD64 && IsAligned(buf) && IsAligned(mask)) - { - word64 acc64 = 0; - for (i=0; i>32); - } - - for (i=0; i>8) | byte(acc32>>16) | byte(acc32>>24); + word32 b, m; + memcpy(&b, buf, 4); memcpy(&m, mask, 4); + acc32 |= b ^ m; + + buf += 4; mask += 4; count -= 4; } - for (i=0; i(acc8) | static_cast(acc8 >> 8) | + static_cast(acc8 >> 16) | static_cast(acc8 >> 24); +#endif + + for (size_t i=0; i