diff --git a/cmd/ztest.c b/cmd/ztest.c index 9dce486ee08c..790835363fef 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -135,6 +135,7 @@ #include #include #include +#include #if (__GLIBC__ && !__UCLIBC__) #include /* for backtrace() */ #endif @@ -6410,6 +6411,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) int i, *ptr; uint32_t size; BLAKE3_CTX ctx; + const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); @@ -6434,7 +6436,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) void *res2 = &zc_res2; /* BLAKE3_KEY_LEN = 32 */ - VERIFY0(blake3_impl_setname("generic")); + VERIFY0(blake3->setname("generic")); templ = abd_checksum_blake3_tmpl_init(&salt); Blake3_InitKeyed(&ctx, salt_ptr); Blake3_Update(&ctx, buf, size); @@ -6443,7 +6445,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) ZIO_CHECKSUM_BSWAP(&zc_ref2); abd_checksum_blake3_tmpl_free(templ); - VERIFY0(blake3_impl_setname("cycle")); + VERIFY0(blake3->setname("cycle")); while (run_count-- > 0) { /* Test current implementation */ diff --git a/config/always-arch.m4 b/config/always-arch.m4 index f7090a4826ba..9f413eeddf95 100644 --- a/config/always-arch.m4 +++ b/config/always-arch.m4 @@ -22,6 +22,9 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [ aarch64*) TARGET_CPU=aarch64 ;; + armv*) + TARGET_CPU=arm + ;; sparc64) TARGET_CPU=sparc64 ;; @@ -31,7 +34,8 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [ esac AM_CONDITIONAL([TARGET_CPU_AARCH64], test $TARGET_CPU = aarch64) - AM_CONDITIONAL([TARGET_CPU_X86_64], test $TARGET_CPU = x86_64) + AM_CONDITIONAL([TARGET_CPU_X86_64], test $TARGET_CPU = x86_64) AM_CONDITIONAL([TARGET_CPU_POWERPC], test $TARGET_CPU = powerpc) AM_CONDITIONAL([TARGET_CPU_SPARC64], test $TARGET_CPU = sparc64) + AM_CONDITIONAL([TARGET_CPU_ARM], test $TARGET_CPU = arm) ]) diff --git a/include/Makefile.am b/include/Makefile.am index 1e5c71150eeb..6897e3c5e337 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -75,6 +75,7 @@ COMMON_H = \ sys/rrwlock.h \ sys/sa.h \ sys/sa_impl.h \ + sys/sha2.h \ sys/skein.h \ sys/spa.h \ sys/spa_checkpoint.h \ @@ -124,6 +125,7 @@ COMMON_H = \ sys/zfs_delay.h \ sys/zfs_file.h \ sys/zfs_fuid.h \ + sys/zfs_impl.h \ sys/zfs_project.h \ sys/zfs_quota.h \ sys/zfs_racct.h \ diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 89d4ef564d5f..9819e534b7f6 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -51,6 +51,8 @@ noinst_HEADERS = \ %D%/spl/sys/sid.h \ %D%/spl/sys/sig.h \ %D%/spl/sys/simd.h \ + %D%/spl/sys/simd_aarch64.h \ + %D%/spl/sys/simd_arm.h \ %D%/spl/sys/simd_powerpc.h \ %D%/spl/sys/simd_x86.h \ %D%/spl/sys/spl_condvar.h \ @@ -79,7 +81,6 @@ noinst_HEADERS = \ %D%/spl/sys/zone.h \ \ %D%/zfs/sys/freebsd_crypto.h \ - %D%/zfs/sys/sha2.h \ %D%/zfs/sys/vdev_os.h \ %D%/zfs/sys/zfs_bootenv_os.h \ %D%/zfs/sys/zfs_context_os.h \ diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 48e8a2adb8d2..77ce75ca3f11 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -97,6 +97,12 @@ #define blake3_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, blake3_param, "A" +#define sha256_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, sha256_param, "A" + +#define sha512_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, sha512_param, "A" + #include #define module_init(fn) \ static void \ diff --git a/include/os/freebsd/spl/sys/simd.h b/include/os/freebsd/spl/sys/simd.h index 3106e4853c70..4560bb05e978 100644 --- 
a/include/os/freebsd/spl/sys/simd.h +++ b/include/os/freebsd/spl/sys/simd.h @@ -32,6 +32,12 @@ #if defined(__amd64__) || defined(__i386__) #include +#elif defined(__arm__) +#include + +#elif defined(__aarch64__) +#include + #elif defined(__powerpc__) #include diff --git a/include/os/freebsd/spl/sys/simd_aarch64.h b/include/os/freebsd/spl/sys/simd_aarch64.h new file mode 100644 index 000000000000..847c2ed29189 --- /dev/null +++ b/include/os/freebsd/spl/sys/simd_aarch64.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2022 Tino Reichardt + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. + * + * Supported features: + * zfs_neon_available() + * zfs_sha256_available() + * zfs_sha512_available() + */ + +#ifndef _FREEBSD_SIMD_AARCH64_H +#define _FREEBSD_SIMD_AARCH64_H + +#include +#include + +#define kfpu_allowed() 1 +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) + +/* + * Check if NEON is available + */ +static inline boolean_t +zfs_neon_available(void) +{ + return (elf_hwcap & HWCAP_FP); +} + +/* + * Check if SHA256 is available + */ +static inline boolean_t +zfs_sha256_available(void) +{ + return (elf_hwcap & HWCAP_SHA2); +} + +/* + * Check if SHA512 is available + */ +static inline boolean_t +zfs_sha512_available(void) +{ + return (elf_hwcap & HWCAP_SHA512); +} + +#endif /* _FREEBSD_SIMD_AARCH64_H */ diff --git a/include/os/freebsd/spl/sys/simd_arm.h b/include/os/freebsd/spl/sys/simd_arm.h new file mode 100644 index 000000000000..f6362cd6bb54 --- /dev/null +++ b/include/os/freebsd/spl/sys/simd_arm.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2022 Tino Reichardt + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. + * + * Supported features: + * zfs_neon_available() + * zfs_sha256_available() + */ + +#ifndef _FREEBSD_SIMD_ARM_H +#define _FREEBSD_SIMD_ARM_H + +#include +#include + +#define kfpu_allowed() 1 +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) + +/* + * Check if NEON is available + */ +static inline boolean_t +zfs_neon_available(void) +{ + return (elf_hwcap & HWCAP_NEON); +} + +/* + * Check if SHA256 is available + */ +static inline boolean_t +zfs_sha256_available(void) +{ + return (elf_hwcap2 & HWCAP2_SHA2); +} + +#endif /* _FREEBSD_SIMD_ARM_H */ diff --git a/include/os/freebsd/spl/sys/simd_powerpc.h b/include/os/freebsd/spl/sys/simd_powerpc.h index 34d5e23e2fbb..edaab81d15fc 100644 --- a/include/os/freebsd/spl/sys/simd_powerpc.h +++ b/include/os/freebsd/spl/sys/simd_powerpc.h @@ -1,38 +1,32 @@ /* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. + * Copyright (C) 2022 Tino Reichardt + * All rights reserved. * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
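
Aside on the elf_hwcap checks above: the same AT_HWCAP/AT_HWCAP2 bits can be probed from user space on FreeBSD through elf_aux_info(3), which is exactly the shim this patch later adds to lib/libspl/include/sys/simd.h. A minimal stand-alone sketch for 32-bit arm (the AT_* and HWCAP bit values mirror the ones defined elsewhere in this patch; the my_* names are illustration only):

	#include <stdio.h>

	#define MY_AT_HWCAP	25	/* FreeBSD auxv keys, as in the patch */
	#define MY_AT_HWCAP2	26
	#define MY_HWCAP_NEON	0x00001000UL
	#define MY_HWCAP2_SHA2	0x00000008UL

	extern int elf_aux_info(int aux, void *buf, int buflen);

	static unsigned long
	my_getauxval(int aux)
	{
		unsigned long val = 0UL;

		if (elf_aux_info(aux, &val, sizeof (val)) != 0)
			return (0UL);
		return (val);
	}

	int
	main(void)
	{
		printf("NEON: %s\n", (my_getauxval(MY_AT_HWCAP) &
		    MY_HWCAP_NEON) ? "yes" : "no");
		printf("SHA2: %s\n", (my_getauxval(MY_AT_HWCAP2) &
		    MY_HWCAP2_SHA2) ? "yes" : "no");
		return (0);
	}
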
* - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * - * CDDL HEADER END - */ - -/* - * Copyright (C) 2022 Tino Reichardt + * $FreeBSD$ */ /* - * USER API: - * - * Kernel fpu methods: - * kfpu_allowed() - * kfpu_begin() - * kfpu_end() - * kfpu_init() - * kfpu_fini() - * * SIMD support: * * Following functions should be called to determine whether CPU feature diff --git a/include/os/freebsd/spl/sys/simd_x86.h b/include/os/freebsd/spl/sys/simd_x86.h index 7a0ca243f768..6512d4fcba4f 100644 --- a/include/os/freebsd/spl/sys/simd_x86.h +++ b/include/os/freebsd/spl/sys/simd_x86.h @@ -173,6 +173,19 @@ zfs_avx2_available(void) return (has_avx2 && __ymm_enabled()); } +/* + * Check if SHA_NI instruction set is available + */ +static inline boolean_t +zfs_shani_available(void) +{ + boolean_t has_shani; + + has_shani = (cpu_stdext_feature & CPUID_STDEXT_SHA) != 0; + + return (has_shani && __ymm_enabled()); +} + /* * AVX-512 family of instruction sets: * diff --git a/include/os/freebsd/zfs/sys/sha2.h b/include/os/freebsd/zfs/sys/sha2.h deleted file mode 100644 index 1f520eba0038..000000000000 --- a/include/os/freebsd/zfs/sys/sha2.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* Copyright 2013 Saso Kiselkov. All rights reserved. 
*/ - -#ifndef _SYS_SHA2_H -#define _SYS_SHA2_H - -#include /* for uint_* */ - -#ifdef __cplusplus -extern "C" { -#endif - -#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ -#define SHA384_DIGEST_LENGTH 48 /* SHA384 digest length in bytes */ -#define SHA512_DIGEST_LENGTH 64 /* SHA512 digest length in bytes */ - -/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */ -#define SHA512_224_DIGEST_LENGTH 28 /* SHA512/224 digest length */ -#define SHA512_256_DIGEST_LENGTH 32 /* SHA512/256 digest length */ - -#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ -#define SHA512_HMAC_BLOCK_SIZE 128 /* SHA512-HMAC block size */ - -#define SHA256 0 -#define SHA256_HMAC 1 -#define SHA256_HMAC_GEN 2 -#define SHA384 3 -#define SHA384_HMAC 4 -#define SHA384_HMAC_GEN 5 -#define SHA512 6 -#define SHA512_HMAC 7 -#define SHA512_HMAC_GEN 8 -#define SHA512_224 9 -#define SHA512_256 10 - -/* - * SHA2 context. - * The contents of this structure are a private interface between the - * Init/Update/Final calls of the functions defined below. - * Callers must never attempt to read or write any of the fields - * in this structure directly. - */ - -#include -#include -#include -#include -typedef struct { - uint32_t algotype; /* Algorithm Type */ - union { - SHA256_CTX SHA256_ctx; - SHA384_CTX SHA384_ctx; - SHA512_CTX SHA512_ctx; - }; -} SHA2_CTX; - -extern void SHA256Init(SHA256_CTX *); - -extern void SHA256Update(SHA256_CTX *, const void *, size_t); - -extern void SHA256Final(void *, SHA256_CTX *); - -extern void SHA384Init(SHA384_CTX *); - -extern void SHA384Update(SHA384_CTX *, const void *, size_t); - -extern void SHA384Final(void *, SHA384_CTX *); - -extern void SHA512Init(SHA512_CTX *); - -extern void SHA512Update(SHA512_CTX *, const void *, size_t); - -extern void SHA512Final(void *, SHA512_CTX *); - - -static inline void -SHA2Init(uint64_t mech, SHA2_CTX *c) -{ - switch (mech) { - case SHA256: - SHA256_Init(&c->SHA256_ctx); - break; - case SHA384: - SHA384_Init(&c->SHA384_ctx); - break; - case SHA512: - SHA512_Init(&c->SHA512_ctx); - break; - case SHA512_256: - SHA512_256_Init(&c->SHA512_ctx); - break; - default: - panic("unknown mechanism %ju", (uintmax_t)mech); - } - c->algotype = (uint32_t)mech; -} - -static inline void -SHA2Update(SHA2_CTX *c, const void *p, size_t s) -{ - switch (c->algotype) { - case SHA256: - SHA256_Update(&c->SHA256_ctx, p, s); - break; - case SHA384: - SHA384_Update(&c->SHA384_ctx, p, s); - break; - case SHA512: - SHA512_Update(&c->SHA512_ctx, p, s); - break; - case SHA512_256: - SHA512_256_Update(&c->SHA512_ctx, p, s); - break; - default: - panic("unknown mechanism %d", c->algotype); - } -} - -static inline void -SHA2Final(void *p, SHA2_CTX *c) -{ - switch (c->algotype) { - case SHA256: - SHA256_Final(p, &c->SHA256_ctx); - break; - case SHA384: - SHA384_Final(p, &c->SHA384_ctx); - break; - case SHA512: - SHA512_Final(p, &c->SHA512_ctx); - break; - case SHA512_256: - SHA512_256_Final(p, &c->SHA512_ctx); - break; - default: - panic("unknown mechanism %d", c->algotype); - } -} - -#ifdef _SHA2_IMPL -/* - * The following types/functions are all private to the implementation - * of the SHA2 functions and must not be used by consumers of the interface - */ - -/* - * List of support mechanisms in this module. 
- * - * It is important to note that in the module, division or modulus calculations - * are used on the enumerated type to determine which mechanism is being used; - * therefore, changing the order or additional mechanisms should be done - * carefully - */ -typedef enum sha2_mech_type { - SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ - SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ - SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ - SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ - SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ - SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ - SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ - SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ - SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ - SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ - SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ -} sha2_mech_type_t; - -#endif /* _SHA2_IMPL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SHA2_H */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index e20702d332ac..3830d198dfff 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -10,6 +10,7 @@ kernel_linux_HEADERS = \ %D%/kernel/linux/percpu_compat.h \ %D%/kernel/linux/simd.h \ %D%/kernel/linux/simd_aarch64.h \ + %D%/kernel/linux/simd_arm.h \ %D%/kernel/linux/simd_powerpc.h \ %D%/kernel/linux/simd_x86.h \ %D%/kernel/linux/utsname_compat.h \ @@ -19,7 +20,6 @@ kernel_linux_HEADERS = \ kernel_sysdir = $(kerneldir)/sys kernel_sys_HEADERS = \ %D%/zfs/sys/policy.h \ - %D%/zfs/sys/sha2.h \ %D%/zfs/sys/trace_acl.h \ %D%/zfs/sys/trace_arc.h \ %D%/zfs/sys/trace_common.h \ diff --git a/include/os/linux/kernel/linux/simd.h b/include/os/linux/kernel/linux/simd.h index b83c536883be..f4376b218a55 100644 --- a/include/os/linux/kernel/linux/simd.h +++ b/include/os/linux/kernel/linux/simd.h @@ -28,13 +28,16 @@ #if defined(__x86) #include +#elif defined(__arm__) +#include + #elif defined(__aarch64__) #include #elif defined(__powerpc__) #include -#else +#else #define kfpu_allowed() 0 #define kfpu_begin() do {} while (0) #define kfpu_end() do {} while (0) diff --git a/include/os/linux/kernel/linux/simd_aarch64.h b/include/os/linux/kernel/linux/simd_aarch64.h index d56a093d4ec2..16276b08c759 100644 --- a/include/os/linux/kernel/linux/simd_aarch64.h +++ b/include/os/linux/kernel/linux/simd_aarch64.h @@ -18,8 +18,11 @@ * * CDDL HEADER END */ + /* * Copyright (C) 2016 Romain Dolbeau . + * Copyright (C) 2022 Tino Reichardt + * Copyright (C) 2022 Sebastian Gottschall */ /* @@ -31,24 +34,83 @@ * kfpu_end() * kfpu_init() * kfpu_fini() + * + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. 
+ * + * Supported features: + * zfs_neon_available() + * zfs_sha256_available() + * zfs_sha512_available() */ #ifndef _LINUX_SIMD_AARCH64_H #define _LINUX_SIMD_AARCH64_H -#include - -#if defined(__aarch64__) - #include #include +#include +#include +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) +#include +#else +#define sys_reg(op0, op1, crn, crm, op2) ( \ + ((op0) << Op0_shift) | \ + ((op1) << Op1_shift) | \ + ((crn) << CRn_shift) | \ + ((crm) << CRm_shift) | \ + ((op2) << Op2_shift)) +#endif + +#define ID_AA64PFR0_EL1 sys_reg(3, 0, 0, 1, 0) +#define ID_AA64ISAR0_EL1 sys_reg(3, 0, 0, 6, 0) #define kfpu_allowed() 1 #define kfpu_begin() kernel_neon_begin() #define kfpu_end() kernel_neon_end() -#define kfpu_init() 0 -#define kfpu_fini() ((void) 0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) + +#define get_ftr(id) { \ + unsigned long __val; \ + asm("mrs %0, "#id : "=r" (__val)); \ + __val; \ +} -#endif /* __aarch64__ */ +/* + * Check if NEON is available + */ +static inline boolean_t +zfs_neon_available(void) +{ + unsigned long ftr = ((get_ftr(ID_AA64PFR0_EL1)) >> 16) & 0xf; + return (ftr == 0 || ftr == 1); +} + +/* + * Check if SHA256 is available + */ +static inline boolean_t +zfs_sha256_available(void) +{ + unsigned long ftr = ((get_ftr(ID_AA64ISAR0_EL1)) >> 12) & 0x3; + return (ftr & 0x1); +} + +/* + * Check if SHA512 is available + */ +static inline boolean_t +zfs_sha512_available(void) +{ + unsigned long ftr = ((get_ftr(ID_AA64ISAR0_EL1)) >> 12) & 0x3; + return (ftr & 0x2); +} #endif /* _LINUX_SIMD_AARCH64_H */ diff --git a/include/os/linux/kernel/linux/simd_arm.h b/include/os/linux/kernel/linux/simd_arm.h new file mode 100644 index 000000000000..c432a6d4abd1 --- /dev/null +++ b/include/os/linux/kernel/linux/simd_arm.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2022 Tino Reichardt + */ + +/* + * USER API: + * + * Kernel fpu methods: + * kfpu_allowed() + * kfpu_begin() + * kfpu_end() + * kfpu_init() + * kfpu_fini() + * + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. 
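
The MRS-based probes above only work where reading the ID registers is permitted; the user-space counterpart added later in this patch (lib/libspl/include/sys/simd.h) goes through getauxval(3) instead. A minimal sketch of that path on Linux/aarch64 (the HWCAP bit values match the ones the patch defines; the my_* names are illustration only):

	#include <sys/auxv.h>	/* getauxval(3), AT_HWCAP */
	#include <stdio.h>

	#define MY_HWCAP_SHA2	0x00000040UL	/* SHA-256 instructions */
	#define MY_HWCAP_SHA512	0x00200000UL	/* SHA-512 instructions */

	int
	main(void)
	{
		unsigned long hwcap = getauxval(AT_HWCAP);

		printf("sha256: %s\n",
		    (hwcap & MY_HWCAP_SHA2) ? "yes" : "no");
		printf("sha512: %s\n",
		    (hwcap & MY_HWCAP_SHA512) ? "yes" : "no");
		return (0);
	}
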
+ * + * Supported features: + * zfs_neon_available() + * zfs_sha256_available() + */ + +#ifndef _LINUX_SIMD_ARM_H +#define _LINUX_SIMD_ARM_H + +#include +#include +#include +#include + +#define kfpu_allowed() 1 +#define kfpu_begin() kernel_neon_begin() +#define kfpu_end() kernel_neon_end() +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) + +/* + * Check if NEON is available + */ +static inline boolean_t +zfs_neon_available(void) +{ + return (elf_hwcap & HWCAP_NEON); +} + +/* + * Check if SHA256 is available + */ +static inline boolean_t +zfs_sha256_available(void) +{ + return (elf_hwcap2 & HWCAP2_SHA2); +} + +#endif /* _LINUX_SIMD_ARM_H */ diff --git a/include/os/linux/kernel/linux/simd_powerpc.h b/include/os/linux/kernel/linux/simd_powerpc.h index f1de3ad01656..7faee70fe263 100644 --- a/include/os/linux/kernel/linux/simd_powerpc.h +++ b/include/os/linux/kernel/linux/simd_powerpc.h @@ -50,9 +50,6 @@ #ifndef _LINUX_SIMD_POWERPC_H #define _LINUX_SIMD_POWERPC_H -/* only for __powerpc__ */ -#if defined(__powerpc__) - #include #include #include @@ -134,6 +131,4 @@ zfs_isa207_available(void) return (cpu_has_feature(CPU_FTR_ARCH_207S)); } -#endif /* defined(__powerpc) */ - #endif /* _LINUX_SIMD_POWERPC_H */ diff --git a/include/os/linux/kernel/linux/simd_x86.h b/include/os/linux/kernel/linux/simd_x86.h index 2f6c3165ac7a..1d77f0487a30 100644 --- a/include/os/linux/kernel/linux/simd_x86.h +++ b/include/os/linux/kernel/linux/simd_x86.h @@ -53,6 +53,8 @@ * zfs_bmi1_available() * zfs_bmi2_available() * + * zfs_shani_available() + * * zfs_avx512f_available() * zfs_avx512cd_available() * zfs_avx512er_available() @@ -586,6 +588,19 @@ zfs_movbe_available(void) #endif } +/* + * Check if SHA_NI instruction set is available + */ +static inline boolean_t +zfs_shani_available(void) +{ +#if defined(X86_FEATURE_SHA_NI) + return (!!boot_cpu_has(X86_FEATURE_SHA_NI)); +#else + return (B_FALSE); +#endif +} + /* * AVX-512 family of instruction sets: * diff --git a/include/os/linux/zfs/sys/sha2.h b/include/os/linux/zfs/sys/sha2.h deleted file mode 100644 index ef37139dd4da..000000000000 --- a/include/os/linux/zfs/sys/sha2.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* Copyright 2013 Saso Kiselkov. All rights reserved. 
*/ - -#ifndef _SYS_SHA2_H -#define _SYS_SHA2_H - -#include /* for uint_* */ - -#ifdef __cplusplus -extern "C" { -#endif - -#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ -#define SHA384_DIGEST_LENGTH 48 /* SHA384 digest length in bytes */ -#define SHA512_DIGEST_LENGTH 64 /* SHA512 digest length in bytes */ - -/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */ -#define SHA512_224_DIGEST_LENGTH 28 /* SHA512/224 digest length */ -#define SHA512_256_DIGEST_LENGTH 32 /* SHA512/256 digest length */ - -#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ -#define SHA512_HMAC_BLOCK_SIZE 128 /* SHA512-HMAC block size */ - -#define SHA256 0 -#define SHA256_HMAC 1 -#define SHA256_HMAC_GEN 2 -#define SHA384 3 -#define SHA384_HMAC 4 -#define SHA384_HMAC_GEN 5 -#define SHA512 6 -#define SHA512_HMAC 7 -#define SHA512_HMAC_GEN 8 -#define SHA512_224 9 -#define SHA512_256 10 - -/* - * SHA2 context. - * The contents of this structure are a private interface between the - * Init/Update/Final calls of the functions defined below. - * Callers must never attempt to read or write any of the fields - * in this structure directly. - */ -typedef struct { - uint32_t algotype; /* Algorithm Type */ - - /* state (ABCDEFGH) */ - union { - uint32_t s32[8]; /* for SHA256 */ - uint64_t s64[8]; /* for SHA384/512 */ - } state; - /* number of bits */ - union { - uint32_t c32[2]; /* for SHA256 , modulo 2^64 */ - uint64_t c64[2]; /* for SHA384/512, modulo 2^128 */ - } count; - union { - uint8_t buf8[128]; /* undigested input */ - uint32_t buf32[32]; /* realigned input */ - uint64_t buf64[16]; /* realigned input */ - } buf_un; -} SHA2_CTX; - -typedef SHA2_CTX SHA256_CTX; -typedef SHA2_CTX SHA384_CTX; -typedef SHA2_CTX SHA512_CTX; - -extern void SHA2Init(uint64_t mech, SHA2_CTX *); - -extern void SHA2Update(SHA2_CTX *, const void *, size_t); - -extern void SHA2Final(void *, SHA2_CTX *); - -extern void SHA256Init(SHA256_CTX *); - -extern void SHA256Update(SHA256_CTX *, const void *, size_t); - -extern void SHA256Final(void *, SHA256_CTX *); - -extern void SHA384Init(SHA384_CTX *); - -extern void SHA384Update(SHA384_CTX *, const void *, size_t); - -extern void SHA384Final(void *, SHA384_CTX *); - -extern void SHA512Init(SHA512_CTX *); - -extern void SHA512Update(SHA512_CTX *, const void *, size_t); - -extern void SHA512Final(void *, SHA512_CTX *); - -#ifdef _SHA2_IMPL -/* - * The following types/functions are all private to the implementation - * of the SHA2 functions and must not be used by consumers of the interface - */ - -/* - * List of support mechanisms in this module. 
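
For reference, the zfs_shani_available() check added above keys off the x86 SHA extensions bit, which is CPUID leaf 7, sub-leaf 0, EBX bit 29 — the same encoding the libspl cpuid table later in this patch uses for _SHA_NI_BIT. A minimal stand-alone user-space check with the GCC/Clang <cpuid.h> helper might look like:

	#include <cpuid.h>	/* __get_cpuid_count(), GCC/Clang builtin header */
	#include <stdio.h>

	int
	main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* CPUID.(EAX=7,ECX=0):EBX bit 29 == SHA extensions */
		if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0) {
			printf("cpuid leaf 7 not supported\n");
			return (1);
		}
		printf("SHA-NI: %s\n", (ebx & (1U << 29)) ? "yes" : "no");
		return (0);
	}
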
- * - * It is important to note that in the module, division or modulus calculations - * are used on the enumerated type to determine which mechanism is being used; - * therefore, changing the order or additional mechanisms should be done - * carefully - */ -typedef enum sha2_mech_type { - SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ - SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ - SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ - SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ - SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ - SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ - SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ - SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ - SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ - SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ - SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ -} sha2_mech_type_t; - -#endif /* _SHA2_IMPL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SHA2_H */ diff --git a/include/sys/blake3.h b/include/sys/blake3.h index ad65fc8db7b9..b981b18db943 100644 --- a/include/sys/blake3.h +++ b/include/sys/blake3.h @@ -22,11 +22,11 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor - * Copyright (c) 2021 Tino Reichardt + * Copyright (c) 2021-2022 Tino Reichardt */ -#ifndef BLAKE3_H -#define BLAKE3_H +#ifndef _SYS_BLAKE3_H +#define _SYS_BLAKE3_H #ifdef _KERNEL #include @@ -97,26 +97,8 @@ extern void **blake3_per_cpu_ctx; extern void blake3_per_cpu_ctx_init(void); extern void blake3_per_cpu_ctx_fini(void); -/* get count of supported implementations */ -extern uint32_t blake3_impl_getcnt(void); - -/* get id of selected implementation */ -extern uint32_t blake3_impl_getid(void); - -/* get name of selected implementation */ -extern const char *blake3_impl_getname(void); - -/* setup id as fastest implementation */ -extern void blake3_impl_set_fastest(uint32_t id); - -/* set implementation by id */ -extern void blake3_impl_setid(uint32_t id); - -/* set implementation by name */ -extern int blake3_impl_setname(const char *name); - #ifdef __cplusplus } #endif -#endif /* BLAKE3_H */ +#endif /* _SYS_BLAKE3_H */ diff --git a/include/sys/sha2.h b/include/sys/sha2.h new file mode 100644 index 000000000000..81dfbbb8cea9 --- /dev/null +++ b/include/sys/sha2.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 Tino Reichardt + */ + +#ifndef _SYS_SHA2_H +#define _SYS_SHA2_H + +#ifdef _KERNEL +#include +#else +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHA224_BLOCK_LENGTH 64 +#define SHA256_BLOCK_LENGTH 64 +#define SHA384_BLOCK_LENGTH 128 +#define SHA512_BLOCK_LENGTH 128 + +#define SHA224_DIGEST_LENGTH 28 +#define SHA256_DIGEST_LENGTH 32 +#define SHA384_DIGEST_LENGTH 48 +#define SHA512_DIGEST_LENGTH 64 + +#define SHA512_224_DIGEST_LENGTH 28 +#define SHA512_256_DIGEST_LENGTH 32 + +#define SHA256_HMAC_BLOCK_SIZE 64 +#define SHA512_HMAC_BLOCK_SIZE 128 + +/* sha256 context */ +typedef struct { + uint32_t state[8]; + uint64_t count[2]; + uint8_t wbuf[64]; + + /* const sha256_ops_t *ops */ + const void *ops; +} sha256_ctx; + +/* sha512 context */ +typedef struct { + uint64_t state[8]; + uint64_t count[2]; + uint8_t wbuf[128]; + + /* const sha256_ops_t *ops */ + const void *ops; +} sha512_ctx; + +/* SHA2 context */ +typedef struct { + union { + sha256_ctx sha256; + sha512_ctx sha512; + }; + + /* algorithm type */ + int algotype; +} SHA2_CTX; + +/* SHA2 algorithm types */ +typedef enum sha2_mech_type { + SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ + SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ + SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ + SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ + SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ + SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ + SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ + SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ + SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ +} sha2_mech_type_t; + +#define SHA256 0 +#define SHA256_HMAC 1 +#define SHA256_HMAC_GEN 2 +#define SHA384 3 +#define SHA384_HMAC 4 +#define SHA384_HMAC_GEN 5 +#define SHA512 6 +#define SHA512_HMAC 7 +#define SHA512_HMAC_GEN 8 +#define SHA512_224 9 +#define SHA512_256 10 + +/* SHA2 Init function */ +extern void SHA2Init(int algotype, SHA2_CTX *ctx); + +/* SHA2 Update function */ +extern void SHA2Update(SHA2_CTX *ctx, const void *data, size_t len); + +/* SHA2 Final function */ +extern void SHA2Final(void *digest, SHA2_CTX *ctx); + +#ifdef __cplusplus +} +#endif + +#endif /* SYS_SHA2_H */ diff --git a/include/sys/zfs_impl.h b/include/sys/zfs_impl.h new file mode 100644 index 000000000000..df4899f132b8 --- /dev/null +++ b/include/sys/zfs_impl.h @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
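
The consolidated sys/sha2.h above keeps the Solaris-style one-shot interface, so a caller only ever touches SHA2Init()/SHA2Update()/SHA2Final() and never the context internals. A minimal sketch of hashing a buffer with the SHA-256 mechanism, using only the names and constants declared above (the surrounding main() is illustration only and assumes linking against the library that provides these symbols):

	#include <sys/sha2.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		static const char msg[] = "abc";
		uint8_t digest[SHA256_DIGEST_LENGTH];
		SHA2_CTX ctx;

		SHA2Init(SHA256, &ctx);		/* algotype selects SHA-256 */
		SHA2Update(&ctx, msg, strlen(msg));
		SHA2Final(digest, &ctx);

		for (int i = 0; i < SHA256_DIGEST_LENGTH; i++)
			printf("%02x", digest[i]);
		printf("\n");
		return (0);
	}
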
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 Tino Reichardt + */ + +#ifndef _SYS_ZFS_IMPL_H +#define _SYS_ZFS_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* generic implementation backends */ +typedef struct +{ + /* algorithm name */ + const char *name; + + /* get number of supported implementations */ + uint32_t (*getcnt)(void); + + /* get id of selected implementation */ + uint32_t (*getid)(void); + + /* get name of selected implementation */ + const char *(*getname)(void); + + /* setup id as fastest implementation */ + void (*set_fastest)(uint32_t id); + + /* set implementation by id */ + void (*setid)(uint32_t id); + + /* set implementation by name */ + int (*setname)(const char *val); +} zfs_impl_t; + +/* return some set of function pointer */ +extern const zfs_impl_t *zfs_impl_get_ops(const char *algo); + +extern const zfs_impl_t zfs_blake3_ops; +extern const zfs_impl_t zfs_sha256_ops; +extern const zfs_impl_t zfs_sha512_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IMPL_H */ diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 5903678dfb41..9fb79ab4a54b 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -110,9 +110,9 @@ _SYS_ZIO_CHECKSUM_H zio_checksum_info_t */ /* SHA2 */ -extern zio_checksum_t abd_checksum_SHA256; -extern zio_checksum_t abd_checksum_SHA512_native; -extern zio_checksum_t abd_checksum_SHA512_byteswap; +extern zio_checksum_t abd_checksum_sha256; +extern zio_checksum_t abd_checksum_sha512_native; +extern zio_checksum_t abd_checksum_sha512_byteswap; /* Skein */ extern zio_checksum_t abd_checksum_skein_native; diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index 7c6cf71de242..4ba55b2158bc 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -16,7 +16,6 @@ nodist_libicp_la_SOURCES = \ module/icp/algs/blake3/blake3.c \ module/icp/algs/blake3/blake3_generic.c \ module/icp/algs/blake3/blake3_impl.c \ - module/icp/algs/blake3/blake3_x86-64.c \ module/icp/algs/edonr/edonr.c \ module/icp/algs/modes/modes.c \ module/icp/algs/modes/cbc.c \ @@ -26,7 +25,9 @@ nodist_libicp_la_SOURCES = \ module/icp/algs/modes/ctr.c \ module/icp/algs/modes/ccm.c \ module/icp/algs/modes/ecb.c \ - module/icp/algs/sha2/sha2.c \ + module/icp/algs/sha2/sha2_generic.c \ + module/icp/algs/sha2/sha256_impl.c \ + module/icp/algs/sha2/sha512_impl.c \ module/icp/algs/skein/skein.c \ module/icp/algs/skein/skein_block.c \ module/icp/algs/skein/skein_iv.c \ @@ -38,18 +39,31 @@ nodist_libicp_la_SOURCES = \ module/icp/core/kcf_prov_lib.c \ module/icp/core/kcf_callprov.c \ module/icp/core/kcf_mech_tabs.c \ - module/icp/core/kcf_prov_tabs.c + module/icp/core/kcf_prov_tabs.c \ + module/zfs/zfs_impl.c if TARGET_CPU_AARCH64 nodist_libicp_la_SOURCES += \ module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \ - module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S + module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S \ + module/icp/asm-aarch64/sha2/sha256-armv8.S \ + module/icp/asm-aarch64/sha2/sha512-armv8.S +endif + +if TARGET_CPU_ARM +nodist_libicp_la_SOURCES += \ + module/icp/asm-arm/sha2/sha256-armv7.S \ + module/icp/asm-arm/sha2/sha512-armv7.S endif if TARGET_CPU_POWERPC nodist_libicp_la_SOURCES += \ module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S \ - module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S + 
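
The zfs_impl_t vtable declared in sys/zfs_impl.h above is what cmd/ztest.c now goes through instead of the removed blake3_impl_* entry points, and the same interface is meant to serve the sha256 and sha512 backends. A hedged sketch of a consumer follows; only zfs_impl_get_ops() and the ops members come from the new header, and the NULL check and printing are illustration (the header does not spell out what an unknown algorithm name returns):

	#include <sys/zfs_impl.h>
	#include <stdint.h>
	#include <stdio.h>

	static void
	list_impls(const char *algo)
	{
		const zfs_impl_t *ops = zfs_impl_get_ops(algo);

		if (ops == NULL)
			return;

		/* force the generic implementation, then walk all of them */
		if (ops->setname("generic") != 0)
			printf("%s: no \"generic\" implementation?\n", algo);

		for (uint32_t id = 0; id < ops->getcnt(); id++) {
			ops->setid(id);
			printf("%s: impl %u = %s\n", algo, id, ops->getname());
		}
	}

	int
	main(void)
	{
		list_impls("blake3");
		list_impls("sha256");
		list_impls("sha512");
		return (0);
	}
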
module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S \ + module/icp/asm-ppc64/sha2/sha256-ppc.S \ + module/icp/asm-ppc64/sha2/sha512-ppc.S \ + module/icp/asm-ppc64/sha2/sha256-p8.S \ + module/icp/asm-ppc64/sha2/sha512-p8.S endif if TARGET_CPU_X86_64 @@ -60,8 +74,8 @@ nodist_libicp_la_SOURCES += \ module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \ module/icp/asm-x86_64/modes/ghash-x86_64.S \ - module/icp/asm-x86_64/sha2/sha256_impl.S \ - module/icp/asm-x86_64/sha2/sha512_impl.S \ + module/icp/asm-x86_64/sha2/sha256-x86_64.S \ + module/icp/asm-x86_64/sha2/sha512-x86_64.S \ module/icp/asm-x86_64/blake3/blake3_avx2.S \ module/icp/asm-x86_64/blake3/blake3_avx512.S \ module/icp/asm-x86_64/blake3/blake3_sse2.S \ diff --git a/lib/libspl/include/Makefile.am b/lib/libspl/include/Makefile.am index c8b41bbc296e..2c1d21edf19d 100644 --- a/lib/libspl/include/Makefile.am +++ b/lib/libspl/include/Makefile.am @@ -46,7 +46,6 @@ libspl_sys_HEADERS = \ %D%/sys/poll.h \ %D%/sys/priv.h \ %D%/sys/processor.h \ - %D%/sys/sha2.h \ %D%/sys/simd.h \ %D%/sys/stack.h \ %D%/sys/stdtypes.h \ diff --git a/lib/libspl/include/sys/sha2.h b/lib/libspl/include/sys/sha2.h deleted file mode 100644 index 40db1a678cea..000000000000 --- a/lib/libspl/include/sys/sha2.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* Copyright 2013 Saso Kiselkov. All rights reserved. */ - -#ifndef _SYS_SHA2_H -#define _SYS_SHA2_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ -#define SHA384_DIGEST_LENGTH 48 /* SHA384 digest length in bytes */ -#define SHA512_DIGEST_LENGTH 64 /* SHA512 digest length in bytes */ - -/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */ -#define SHA512_224_DIGEST_LENGTH 28 /* SHA512/224 digest length */ -#define SHA512_256_DIGEST_LENGTH 32 /* SHA512/256 digest length */ - -#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ -#define SHA512_HMAC_BLOCK_SIZE 128 /* SHA512-HMAC block size */ - -#define SHA256 0 -#define SHA256_HMAC 1 -#define SHA256_HMAC_GEN 2 -#define SHA384 3 -#define SHA384_HMAC 4 -#define SHA384_HMAC_GEN 5 -#define SHA512 6 -#define SHA512_HMAC 7 -#define SHA512_HMAC_GEN 8 -#define SHA512_224 9 -#define SHA512_256 10 - -/* - * SHA2 context. - * The contents of this structure are a private interface between the - * Init/Update/Final calls of the functions defined below. - * Callers must never attempt to read or write any of the fields - * in this structure directly. 
- */ -typedef struct { - uint32_t algotype; /* Algorithm Type */ - - /* state (ABCDEFGH) */ - union { - uint32_t s32[8]; /* for SHA256 */ - uint64_t s64[8]; /* for SHA384/512 */ - } state; - /* number of bits */ - union { - uint32_t c32[2]; /* for SHA256 , modulo 2^64 */ - uint64_t c64[2]; /* for SHA384/512, modulo 2^128 */ - } count; - union { - uint8_t buf8[128]; /* undigested input */ - uint32_t buf32[32]; /* realigned input */ - uint64_t buf64[16]; /* realigned input */ - } buf_un; -} SHA2_CTX; - -typedef SHA2_CTX SHA256_CTX; -typedef SHA2_CTX SHA384_CTX; -typedef SHA2_CTX SHA512_CTX; - -extern void SHA256Init(SHA256_CTX *); - -extern void SHA256Update(SHA256_CTX *, const void *, size_t); - -extern void SHA256Final(void *, SHA256_CTX *); - -extern void SHA384Init(SHA384_CTX *); - -extern void SHA384Update(SHA384_CTX *, const void *, size_t); - -extern void SHA384Final(void *, SHA384_CTX *); - -extern void SHA512Init(SHA512_CTX *); - -extern void SHA512Update(SHA512_CTX *, const void *, size_t); - -extern void SHA512Final(void *, SHA512_CTX *); - -extern void SHA2Init(uint64_t mech, SHA2_CTX *); - -extern void SHA2Update(SHA2_CTX *, const void *, size_t); - -extern void SHA2Final(void *, SHA2_CTX *); - -#ifdef _SHA2_IMPL -/* - * The following types/functions are all private to the implementation - * of the SHA2 functions and must not be used by consumers of the interface - */ - -/* - * List of support mechanisms in this module. - * - * It is important to note that in the module, division or modulus calculations - * are used on the enumerated type to determine which mechanism is being used; - * therefore, changing the order or additional mechanisms should be done - * carefully - */ -typedef enum sha2_mech_type { - SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ - SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ - SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ - SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ - SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ - SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ - SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ - SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ - SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ - SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ - SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ -} sha2_mech_type_t; - -#endif /* _SHA2_IMPL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SHA2_H */ diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h index 2be5173f317c..7b06ddf58305 100644 --- a/lib/libspl/include/sys/simd.h +++ b/lib/libspl/include/sys/simd.h @@ -30,6 +30,28 @@ #include #include +/* including clashes with AT_UID and others */ +#if defined(__arm__) || defined(__aarch64__) || defined(powerpc) +#if defined(__FreeBSD__) +#define AT_HWCAP 25 +#define AT_HWCAP2 26 +extern int elf_aux_info(int aux, void *buf, int buflen); +static inline unsigned long getauxval(unsigned long key) +{ + unsigned long val = 0UL; + + if (elf_aux_info((int)key, &val, sizeof (val)) != 0) + return (0UL); + + return (val); +} +#elif defined(__linux__) +#define AT_HWCAP 16 +#define AT_HWCAP2 26 +extern unsigned long getauxval(unsigned long type); +#endif /* __linux__ */ +#endif /* arm || aarch64 || powerpc */ + #if defined(__x86) #include @@ -78,7 +100,8 @@ typedef enum cpuid_inst_sets { AVX512VL, AES, PCLMULQDQ, - MOVBE + MOVBE, + SHA_NI } cpuid_inst_sets_t; /* @@ -103,6 +126,7 @@ typedef struct cpuid_feature_desc { #define _AES_BIT (1U << 25) #define 
_PCLMULQDQ_BIT (1U << 1) #define _MOVBE_BIT (1U << 22) +#define _SHA_NI_BIT (1U << 29) /* * Descriptions of supported instruction sets @@ -131,6 +155,7 @@ static const cpuid_feature_desc_t cpuid_features[] = { [AES] = {1U, 0U, _AES_BIT, ECX }, [PCLMULQDQ] = {1U, 0U, _PCLMULQDQ_BIT, ECX }, [MOVBE] = {1U, 0U, _MOVBE_BIT, ECX }, + [SHA_NI] = {7U, 0U, _SHA_NI_BIT, EBX }, }; /* @@ -204,6 +229,7 @@ CPUID_FEATURE_CHECK(avx512vl, AVX512VL); CPUID_FEATURE_CHECK(aes, AES); CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); CPUID_FEATURE_CHECK(movbe, MOVBE); +CPUID_FEATURE_CHECK(shani, SHA_NI); /* * Detect register set support @@ -345,6 +371,15 @@ zfs_movbe_available(void) return (__cpuid_has_movbe()); } +/* + * Check if SHA_NI instruction is available + */ +static inline boolean_t +zfs_shani_available(void) +{ + return (__cpuid_has_shani()); +} + /* * AVX-512 family of instruction sets: * @@ -443,6 +478,36 @@ zfs_avx512vbmi_available(void) __zmm_enabled()); } +#elif defined(__arm__) + +#define kfpu_allowed() 1 +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) + +#define HWCAP_NEON 0x00001000 +#define HWCAP2_SHA2 0x00000008 + +/* + * Check if NEON is available + */ +static inline boolean_t +zfs_neon_available(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_NEON); +} + +/* + * Check if SHA2 is available + */ +static inline boolean_t +zfs_sha256_available(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP2_SHA2); +} + #elif defined(__aarch64__) #define kfpu_allowed() 1 @@ -450,28 +515,41 @@ zfs_avx512vbmi_available(void) #define kfpu_begin() do {} while (0) #define kfpu_end() do {} while (0) -#elif defined(__powerpc__) +#define HWCAP_FP 0x00000001 +#define HWCAP_SHA2 0x00000040 +#define HWCAP_SHA512 0x00200000 -/* including clashes with AT_UID and others */ -#if defined(__FreeBSD__) -#define AT_HWCAP 25 /* CPU feature flags. */ -#define AT_HWCAP2 26 /* CPU feature flags 2. */ -extern int elf_aux_info(int aux, void *buf, int buflen); -static inline unsigned long -getauxval(unsigned long key) +/* + * Check if NEON is available + */ +static inline boolean_t +zfs_neon_available(void) { - unsigned long val = 0UL; + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_FP); +} - if (elf_aux_info((int)key, &val, sizeof (val)) != 0) - return (0UL); +/* + * Check if SHA2 is available + */ +static inline boolean_t +zfs_sha256_available(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_SHA2); +} - return (val); +/* + * Check if SHA512 is available + */ +static inline boolean_t +zfs_sha512_available(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_SHA512); } -#elif defined(__linux__) -#define AT_HWCAP 16 /* CPU feature flags. */ -#define AT_HWCAP2 26 /* CPU feature flags 2. 
*/ -extern unsigned long getauxval(unsigned long type); -#endif + +#elif defined(__powerpc__) #define kfpu_allowed() 1 #define kfpu_initialize(tsk) do {} while (0) @@ -479,30 +557,28 @@ extern unsigned long getauxval(unsigned long type); #define kfpu_end() do {} while (0) #define PPC_FEATURE_HAS_ALTIVEC 0x10000000 +#define PPC_FEATURE_HAS_VSX 0x00000080 +#define PPC_FEATURE2_ARCH_2_07 0x80000000 + static inline boolean_t zfs_altivec_available(void) { unsigned long hwcap = getauxval(AT_HWCAP); - return (hwcap & PPC_FEATURE_HAS_ALTIVEC); } -#define PPC_FEATURE_HAS_VSX 0x00000080 static inline boolean_t zfs_vsx_available(void) { unsigned long hwcap = getauxval(AT_HWCAP); - return (hwcap & PPC_FEATURE_HAS_VSX); } -#define PPC_FEATURE2_ARCH_2_07 0x80000000 static inline boolean_t zfs_isa207_available(void) { unsigned long hwcap = getauxval(AT_HWCAP); unsigned long hwcap2 = getauxval(AT_HWCAP2); - return ((hwcap & PPC_FEATURE_HAS_VSX) && (hwcap2 & PPC_FEATURE2_ARCH_2_07)); } diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index f5eb84679204..cffe341220c2 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -34,8 +34,6 @@ dist_libzfs_la_SOURCES += \ endif nodist_libzfs_la_SOURCES = \ - module/icp/algs/sha2/sha2.c \ - \ module/zcommon/cityhash.c \ module/zcommon/zfeature_common.c \ module/zcommon/zfs_comutil.c \ @@ -52,7 +50,6 @@ nodist_libzfs_la_SOURCES = \ module/zcommon/zpool_prop.c \ module/zcommon/zprop_common.c - libzfs_la_LIBADD = \ libshare.la \ libzfs_core.la \ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 0cc1997f7a99..0748f1240db9 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -118,7 +118,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/refcount.c \ module/zfs/rrwlock.c \ module/zfs/sa.c \ - module/zfs/sha256.c \ + module/zfs/sha2_zfs.c \ module/zfs/skein_zfs.c \ module/zfs/spa.c \ module/zfs/spa_checkpoint.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index a1ea08cd4348..21606b8cae27 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -85,7 +85,6 @@ ICP_OBJS := \ algs/blake3/blake3.o \ algs/blake3/blake3_generic.o \ algs/blake3/blake3_impl.o \ - algs/blake3/blake3_x86-64.o \ algs/edonr/edonr.o \ algs/modes/cbc.o \ algs/modes/ccm.o \ @@ -94,7 +93,9 @@ ICP_OBJS := \ algs/modes/gcm.o \ algs/modes/gcm_generic.o \ algs/modes/modes.o \ - algs/sha2/sha2.o \ + algs/sha2/sha2_generic.o \ + algs/sha2/sha256_impl.o \ + algs/sha2/sha512_impl.o \ algs/skein/skein.o \ algs/skein/skein_block.o \ algs/skein/skein_iv.o \ @@ -120,32 +121,40 @@ ICP_OBJS_X86_64 := \ asm-x86_64/blake3/blake3_avx512.o \ asm-x86_64/blake3/blake3_sse2.o \ asm-x86_64/blake3/blake3_sse41.o \ + asm-x86_64/sha2/sha256-x86_64.o \ + asm-x86_64/sha2/sha512-x86_64.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ - asm-x86_64/modes/ghash-x86_64.o \ - asm-x86_64/sha2/sha256_impl.o \ - asm-x86_64/sha2/sha512_impl.o - + asm-x86_64/modes/ghash-x86_64.o ICP_OBJS_X86 := \ algs/aes/aes_impl_aesni.o \ algs/aes/aes_impl_x86-64.o \ algs/modes/gcm_pclmulqdq.o +ICP_OBJS_ARM := \ + asm-arm/sha2/sha256-armv7.o \ + asm-arm/sha2/sha512-armv7.o ICP_OBJS_ARM64 := \ asm-aarch64/blake3/b3_aarch64_sse2.o \ - asm-aarch64/blake3/b3_aarch64_sse41.o - + asm-aarch64/blake3/b3_aarch64_sse41.o \ + asm-aarch64/sha2/sha256-armv8.o \ + asm-aarch64/sha2/sha512-armv8.o ICP_OBJS_PPC_PPC64 := \ asm-ppc64/blake3/b3_ppc64le_sse2.o \ - asm-ppc64/blake3/b3_ppc64le_sse41.o + asm-ppc64/blake3/b3_ppc64le_sse41.o \ + asm-ppc64/sha2/sha256-p8.o \ + 
asm-ppc64/sha2/sha512-p8.o \ + asm-ppc64/sha2/sha256-ppc.o \ + asm-ppc64/sha2/sha512-ppc.o zfs-objs += $(addprefix icp/,$(ICP_OBJS)) zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64)) +zfs-$(CONFIG_ARM) += $(addprefix icp/,$(ICP_OBJS_ARM)) zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) @@ -159,12 +168,10 @@ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ # Suppress objtool "return with modified stack frame" warnings. OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y -# Suppress objtool "unsupported stack pointer realignment" warnings. We are -# not using a DRAP register while aligning the stack to a 64 byte boundary. +# Suppress objtool "unsupported stack pointer realignment" warnings. # See #6950 for the reasoning. -OBJECT_FILES_NON_STANDARD_sha256_impl.o := y -OBJECT_FILES_NON_STANDARD_sha512_impl.o := y - +OBJECT_FILES_NON_STANDARD_sha256-x86_64.o := y +OBJECT_FILES_NON_STANDARD_sha512-x86_64.o := y LUA_OBJS := \ lapi.o \ @@ -344,7 +351,7 @@ ZFS_OBJS := \ refcount.o \ rrwlock.o \ sa.o \ - sha256.o \ + sha2_zfs.o \ skein_zfs.o \ spa.o \ spa_checkpoint.o \ @@ -392,6 +399,7 @@ ZFS_OBJS := \ zfs_chksum.o \ zfs_fm.o \ zfs_fuid.o \ + zfs_impl.o \ zfs_ioctl.o \ zfs_log.o \ zfs_onexit.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 999dc90ff59f..667678796779 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -13,10 +13,15 @@ KMOD= openzfs ${SRCDIR}/lua \ ${SRCDIR}/nvpair \ ${SRCDIR}/icp/algs/blake3 \ + ${SRCDIR}/icp/algs/edonr \ + ${SRCDIR}/icp/algs/sha2 \ ${SRCDIR}/icp/asm-aarch64/blake3 \ + ${SRCDIR}/icp/asm-aarch64/sha2 \ + ${SRCDIR}/icp/asm-arm/sha2 \ + ${SRCDIR}/icp/asm-ppc64/sha2 \ ${SRCDIR}/icp/asm-ppc64/blake3 \ ${SRCDIR}/icp/asm-x86_64/blake3 \ - ${SRCDIR}/icp/algs/edonr \ + ${SRCDIR}/icp/asm-x86_64/sha2 \ ${SRCDIR}/os/freebsd/spl \ ${SRCDIR}/os/freebsd/zfs \ ${SRCDIR}/unicode \ @@ -27,8 +32,6 @@ KMOD= openzfs ${SRCDIR}/zstd/lib/compress \ ${SRCDIR}/zstd/lib/decompress - - CFLAGS+= -I${INCDIR} CFLAGS+= -I${INCDIR}/os/freebsd CFLAGS+= -I${INCDIR}/os/freebsd/spl @@ -88,8 +91,7 @@ SRCS+= edonr.c #icp/algs/blake3 SRCS+= blake3.c \ blake3_generic.c \ - blake3_impl.c \ - blake3_x86-64.c + blake3_impl.c #icp/asm-aarch64/blake3 SRCS+= b3_aarch64_sse2.S \ @@ -105,6 +107,29 @@ SRCS+= blake3_avx2.S \ blake3_sse2.S \ blake3_sse41.S +#icp/algs/sha2 +SRCS+= sha2_generic.c \ + sha256_impl.c \ + sha512_impl.c + +#icp/asm-arm/sha2 +SRCS+= sha256-armv7.S \ + sha512-armv7.S + +#icp/asm-aarch64/sha2 +SRCS+= sha256-armv8.S \ + sha512-armv8.S + +#icp/asm-ppc64/sha2 +SRCS+= sha256-p8.S \ + sha512-p8.S \ + sha256-ppc.S \ + sha512-ppc.S + +#icp/asm-x86_64/sha2 +SRCS+= sha256-x86_64.S \ + sha512-x86_64.S + #lua SRCS+= lapi.c \ lauxlib.c \ @@ -141,8 +166,6 @@ SRCS+= nvpair.c \ SRCS+= acl_common.c \ callb.c \ list.c \ - sha256c.c \ - sha512c.c \ spl_acl.c \ spl_cmn_err.c \ spl_dtrace.c \ @@ -268,7 +291,7 @@ SRCS+= abd.c \ refcount.c \ rrwlock.c \ sa.c \ - sha256.c \ + sha2_zfs.c \ skein_zfs.c \ spa.c \ spa_checkpoint.c \ @@ -322,6 +345,7 @@ SRCS+= abd.c \ zfs_file_os.c \ zfs_fm.c \ zfs_fuid.c \ + zfs_impl.c \ zfs_ioctl.c \ zfs_log.c \ zfs_onexit.c \ diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c index 8e441f454a72..4f93e4ff2051 100644 --- a/module/icp/algs/blake3/blake3.c +++ 
b/module/icp/algs/blake3/blake3.c @@ -432,7 +432,7 @@ static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8], memcpy(ctx->key, key, BLAKE3_KEY_LEN); chunk_state_init(&ctx->chunk, key, flags); ctx->cv_stack_len = 0; - ctx->ops = blake3_impl_get_ops(); + ctx->ops = blake3_get_ops(); } /* diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c index 94a1f108236e..ca7197a26f39 100644 --- a/module/icp/algs/blake3/blake3_generic.c +++ b/module/icp/algs/blake3/blake3_generic.c @@ -187,7 +187,8 @@ static inline void blake3_hash_many_generic(const uint8_t * const *inputs, } } -static inline boolean_t blake3_is_generic_supported(void) +/* the generic implementation is always okay */ +static boolean_t blake3_is_supported(void) { return (B_TRUE); } @@ -196,7 +197,7 @@ const blake3_ops_t blake3_generic_impl = { .compress_in_place = blake3_compress_in_place_generic, .compress_xof = blake3_compress_xof_generic, .hash_many = blake3_hash_many_generic, - .is_supported = blake3_is_generic_supported, + .is_supported = blake3_is_supported, .degree = 4, .name = "generic" }; diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c index 7bc4db2c9806..f68a5edfeaa4 100644 --- a/module/icp/algs/blake3/blake3_impl.c +++ b/module/icp/algs/blake3/blake3_impl.c @@ -24,222 +24,266 @@ */ #include -#include +#include +#include +#include #include "blake3_impl.h" -static const blake3_ops_t *const blake3_impls[] = { - &blake3_generic_impl, #if defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE2)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - &blake3_sse2_impl, -#endif -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - &blake3_sse41_impl, -#endif -#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) - &blake3_avx2_impl, -#endif -#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) - &blake3_avx512_impl, -#endif -}; - -/* Select BLAKE3 implementation */ -#define IMPL_FASTEST (UINT32_MAX) -#define IMPL_CYCLE (UINT32_MAX - 1) - -#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) - -/* Indicate that benchmark has been done */ -static boolean_t blake3_initialized = B_FALSE; - -/* Implementation that contains the fastest methods */ -static blake3_ops_t blake3_fastest_impl = { - .name = "fastest" -}; -/* Hold all supported implementations */ -static const blake3_ops_t *blake3_supp_impls[ARRAY_SIZE(blake3_impls)]; -static uint32_t blake3_supp_impls_cnt = 0; +extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, + flags); + kfpu_end(); +} -/* Currently selected implementation */ -static uint32_t blake3_impl_chosen = IMPL_FASTEST; +static void 
blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} -static struct blake3_impl_selector { - const char *name; - uint32_t sel; -} blake3_impl_selectors[] = { - { "cycle", IMPL_CYCLE }, - { "fastest", IMPL_FASTEST } -}; +static void blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} -/* check the supported implementations */ -static void blake3_impl_init(void) +static boolean_t blake3_is_sse2_supported(void) { - int i, c; - - /* init only once */ - if (likely(blake3_initialized)) - return; +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse2_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} - /* move supported implementations into blake3_supp_impls */ - for (i = 0, c = 0; i < ARRAY_SIZE(blake3_impls); i++) { - const blake3_ops_t *impl = blake3_impls[i]; +const blake3_ops_t blake3_sse2_impl = { + .compress_in_place = blake3_compress_in_place_sse2, + .compress_xof = blake3_compress_xof_sse2, + .hash_many = blake3_hash_many_sse2, + .is_supported = blake3_is_sse2_supported, + .degree = 4, + .name = "sse2" +}; +#endif - if (impl->is_supported && impl->is_supported()) - blake3_supp_impls[c++] = impl; - } - blake3_supp_impls_cnt = c; +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - /* first init generic impl, may be changed via set_fastest() */ - memcpy(&blake3_fastest_impl, blake3_impls[0], - sizeof (blake3_fastest_impl)); - blake3_initialized = B_TRUE; +extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, + flags); + kfpu_end(); } -/* get number of supported implementations */ -uint32_t -blake3_impl_getcnt(void) -{ - blake3_impl_init(); - return (blake3_supp_impls_cnt); +static void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, + out); + kfpu_end(); } -/* get id of selected implementation */ -uint32_t -blake3_impl_getid(void) -{ - return (IMPL_READ(blake3_impl_chosen)); +static void 
blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -/* get name of selected implementation */ -const char * -blake3_impl_getname(void) +static boolean_t blake3_is_sse41_supported(void) { - uint32_t impl = IMPL_READ(blake3_impl_chosen); - - blake3_impl_init(); - switch (impl) { - case IMPL_FASTEST: - return ("fastest"); - case IMPL_CYCLE: - return ("cycle"); - default: - return (blake3_supp_impls[impl]->name); - } +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse4_1_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif } -/* setup id as fastest implementation */ -void -blake3_impl_set_fastest(uint32_t id) -{ - /* setup fastest impl */ - memcpy(&blake3_fastest_impl, blake3_supp_impls[id], - sizeof (blake3_fastest_impl)); +const blake3_ops_t blake3_sse41_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_sse41, + .is_supported = blake3_is_sse41_supported, + .degree = 4, + .name = "sse41" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -/* set implementation by id */ -void -blake3_impl_setid(uint32_t id) +static boolean_t blake3_is_avx2_supported(void) { - blake3_impl_init(); - switch (id) { - case IMPL_FASTEST: - atomic_swap_32(&blake3_impl_chosen, IMPL_FASTEST); - break; - case IMPL_CYCLE: - atomic_swap_32(&blake3_impl_chosen, IMPL_CYCLE); - break; - default: - ASSERT3U(id, <, blake3_supp_impls_cnt); - atomic_swap_32(&blake3_impl_chosen, id); - break; - } + return (kfpu_allowed() && zfs_sse4_1_available() && + zfs_avx2_available()); } -/* set implementation by name */ -int -blake3_impl_setname(const char *val) -{ - uint32_t impl = IMPL_READ(blake3_impl_chosen); - size_t val_len; - int i, err = -EINVAL; - - blake3_impl_init(); - val_len = strlen(val); - while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ - val_len--; - - /* check mandatory implementations */ - for (i = 0; i < ARRAY_SIZE(blake3_impl_selectors); i++) { - const char *name = blake3_impl_selectors[i].name; - - if (val_len == strlen(name) && - strncmp(val, name, val_len) == 0) { - impl = blake3_impl_selectors[i].sel; - err = 0; - break; - } - } +const blake3_ops_t +blake3_avx2_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_avx2, + .is_supported = blake3_is_avx2_supported, + .degree = 8, + .name = "avx2" +}; +#endif - if (err != 0 && 
blake3_initialized) { - /* check all supported implementations */ - for (i = 0; i < blake3_supp_impls_cnt; i++) { - const char *name = blake3_supp_impls[i]->name; - - if (val_len == strlen(name) && - strncmp(val, name, val_len) == 0) { - impl = i; - err = 0; - break; - } - } - } +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, + flags); + kfpu_end(); +} - if (err == 0) { - atomic_swap_32(&blake3_impl_chosen, impl); - } +static void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} - return (err); +static void blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -const blake3_ops_t * -blake3_impl_get_ops(void) +static boolean_t blake3_is_avx512_supported(void) { - const blake3_ops_t *ops = NULL; - uint32_t impl = IMPL_READ(blake3_impl_chosen); - - blake3_impl_init(); - switch (impl) { - case IMPL_FASTEST: - ASSERT(blake3_initialized); - ops = &blake3_fastest_impl; - break; - case IMPL_CYCLE: - /* Cycle through supported implementations */ - ASSERT(blake3_initialized); - ASSERT3U(blake3_supp_impls_cnt, >, 0); - static uint32_t cycle_count = 0; - uint32_t idx = (++cycle_count) % blake3_supp_impls_cnt; - ops = blake3_supp_impls[idx]; - break; - default: - ASSERT3U(blake3_supp_impls_cnt, >, 0); - ASSERT3U(impl, <, blake3_supp_impls_cnt); - ops = blake3_supp_impls[impl]; - break; - } - - ASSERT3P(ops, !=, NULL); - return (ops); + return (kfpu_allowed() && zfs_avx512f_available() && + zfs_avx512vl_available()); } -#if defined(_KERNEL) +const blake3_ops_t blake3_avx512_impl = { + .compress_in_place = blake3_compress_in_place_avx512, + .compress_xof = blake3_compress_xof_avx512, + .hash_many = blake3_hash_many_avx512, + .is_supported = blake3_is_avx512_supported, + .degree = 16, + .name = "avx512" +}; +#endif + +extern const blake3_ops_t blake3_generic_impl; + +static const blake3_ops_t *const blake3_impls[] = { + &blake3_generic_impl, +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse2_impl, +#endif +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + 
(defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse41_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) + &blake3_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + &blake3_avx512_impl, +#endif +}; +/* use the generic implementation functions */ +#define IMPL_NAME "blake3" +#define IMPL_OPS_T blake3_ops_t +#define IMPL_ARRAY blake3_impls +#define IMPL_GET_OPS blake3_get_ops +#define ZFS_IMPL_OPS zfs_blake3_ops +#include + +#ifdef _KERNEL void **blake3_per_cpu_ctx; void @@ -253,9 +297,6 @@ blake3_per_cpu_ctx_init(void) blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX), KM_SLEEP); } - - /* init once in kernel mode */ - blake3_impl_init(); } void @@ -276,7 +317,7 @@ blake3_per_cpu_ctx_fini(void) static int blake3_param_get(char *buffer, zfs_kernel_param_t *unused) { - const uint32_t impl = IMPL_READ(blake3_impl_chosen); + const uint32_t impl = IMPL_READ(generic_impl_chosen); char *fmt; int cnt = 0; @@ -289,10 +330,11 @@ blake3_param_get(char *buffer, zfs_kernel_param_t *unused) cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest"); /* list all supported implementations */ - for (uint32_t i = 0; i < blake3_supp_impls_cnt; ++i) { + generic_impl_init(); + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, - blake3_supp_impls[i]->name); + blake3_impls[i]->name); } return (cnt); @@ -302,7 +344,7 @@ static int blake3_param_set(const char *val, zfs_kernel_param_t *unused) { (void) unused; - return (blake3_impl_setname(val)); + return (generic_impl_setname(val)); } #elif defined(__FreeBSD__) @@ -314,8 +356,9 @@ blake3_param(ZFS_MODULE_PARAM_ARGS) { int err; + generic_impl_init(); if (req->newptr == NULL) { - const uint32_t impl = IMPL_READ(blake3_impl_chosen); + const uint32_t impl = IMPL_READ(generic_impl_chosen); const int init_buflen = 64; const char *fmt; struct sbuf *s; @@ -331,9 +374,9 @@ blake3_param(ZFS_MODULE_PARAM_ARGS) (void) sbuf_printf(s, fmt, "fastest"); /* list all supported implementations */ - for (uint32_t i = 0; i < blake3_supp_impls_cnt; ++i) { + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); - (void) sbuf_printf(s, fmt, blake3_supp_impls[i]->name); + (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); } err = sbuf_finish(s); @@ -349,7 +392,7 @@ blake3_param(ZFS_MODULE_PARAM_ARGS) return (err); } - return (-blake3_impl_setname(buf)); + return (-generic_impl_setname(buf)); } #endif diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h index ecb51e3a3010..90d508fac08f 100644 --- a/module/icp/algs/blake3/blake3_impl.h +++ b/module/icp/algs/blake3/blake3_impl.h @@ -25,14 +25,13 @@ * Copyright (c) 2021-2022 Tino Reichardt */ -#ifndef BLAKE3_IMPL_H +#ifndef BLAKE3_IMPL_H #define BLAKE3_IMPL_H #ifdef __cplusplus extern "C" { #endif -#include #include #include #include @@ -56,7 +55,7 @@ typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs, typedef boolean_t (*blake3_is_supported_f)(void); -typedef struct blake3_impl_ops { +typedef struct { blake3_compress_in_place_f compress_in_place; blake3_compress_xof_f compress_xof; blake3_hash_many_f hash_many; @@ -65,30 +64,8 @@ typedef struct blake3_impl_ops { const char *name; } blake3_ops_t; -/* Return selected BLAKE3 implementation ops */ -extern const blake3_ops_t *blake3_impl_get_ops(void); - -extern const blake3_ops_t blake3_generic_impl; - -#if 
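/*
 * Illustrative sketch, not part of the patch: the IMPL_NAME / IMPL_OPS_T /
 * IMPL_ARRAY / IMPL_GET_OPS / ZFS_IMPL_OPS defines above parameterize a
 * shared selection template (presumably generic_impl.c, not shown in this
 * diff).  The fragment below is inferred from the symbols this patch uses
 * (generic_impl_chosen, generic_supp_impls, generic_supp_impls_cnt,
 * generic_impl_init, generic_impl_setname, IMPL_READ, IMPL_CYCLE,
 * IMPL_FASTEST) and from the per-algorithm selector it removes from
 * blake3_impl.c; the real template may differ, e.g. it also has to track a
 * benchmarked "fastest" entry and provide IMPL_GET_OPS and ZFS_IMPL_OPS.
 * It assumes IMPL_OPS_T and IMPL_ARRAY are defined before inclusion.
 */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#define	IMPL_READ(i)	(*(volatile uint32_t *)&(i))

static uint32_t generic_impl_chosen = IMPL_FASTEST;
static const IMPL_OPS_T *generic_supp_impls[ARRAY_SIZE(IMPL_ARRAY)];
static uint32_t generic_supp_impls_cnt = 0;

/* Filter IMPL_ARRAY down to the implementations this CPU supports. */
static void
generic_impl_init(void)
{
	if (generic_supp_impls_cnt != 0)
		return;
	for (uint32_t i = 0; i < ARRAY_SIZE(IMPL_ARRAY); i++) {
		if (IMPL_ARRAY[i]->is_supported())
			generic_supp_impls[generic_supp_impls_cnt++] =
			    IMPL_ARRAY[i];
	}
}

/* Select by name: "cycle", "fastest", or one of the ->name strings. */
static int
generic_impl_setname(const char *val)
{
	generic_impl_init();
	if (strcmp(val, "fastest") == 0) {
		atomic_swap_32(&generic_impl_chosen, IMPL_FASTEST);
		return (0);
	}
	if (strcmp(val, "cycle") == 0) {
		atomic_swap_32(&generic_impl_chosen, IMPL_CYCLE);
		return (0);
	}
	for (uint32_t i = 0; i < generic_supp_impls_cnt; i++) {
		if (strcmp(val, generic_supp_impls[i]->name) == 0) {
			atomic_swap_32(&generic_impl_chosen, i);
			return (0);
		}
	}
	return (-EINVAL);
}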
defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern const blake3_ops_t blake3_sse2_impl; -#endif - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern const blake3_ops_t blake3_sse41_impl; -#endif - -#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) -extern const blake3_ops_t blake3_avx2_impl; -#endif - -#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) -extern const blake3_ops_t blake3_avx512_impl; -#endif +/* return selected BLAKE3 implementation ops */ +extern const blake3_ops_t *blake3_get_ops(void); #if defined(__x86_64) #define MAX_SIMD_DEGREE 16 diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c deleted file mode 100644 index 04a8b3333656..000000000000 --- a/module/icp/algs/blake3/blake3_x86-64.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2021-2022 Tino Reichardt - */ - -#include "blake3_impl.h" - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - -extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags); - -extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]); - -extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_compress_in_place_sse2(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags) { - kfpu_begin(); - zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, - flags); - kfpu_end(); -} - -static void blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]) { - kfpu_begin(); - zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, - out); - kfpu_end(); -} - -static void blake3_hash_many_sse2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, 
counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_sse2_supported(void) -{ -#if defined(__x86_64) - return (kfpu_allowed() && zfs_sse2_available()); -#elif defined(__PPC64__) && defined(__linux__) - return (kfpu_allowed() && zfs_vsx_available()); -#else - return (kfpu_allowed()); -#endif -} - -const blake3_ops_t blake3_sse2_impl = { - .compress_in_place = blake3_compress_in_place_sse2, - .compress_xof = blake3_compress_xof_sse2, - .hash_many = blake3_hash_many_sse2, - .is_supported = blake3_is_sse2_supported, - .degree = 4, - .name = "sse2" -}; -#endif - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - -extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags); - -extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]); - -extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags) { - kfpu_begin(); - zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, - flags); - kfpu_end(); -} - -static void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]) { - kfpu_begin(); - zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, - out); - kfpu_end(); -} - -static void blake3_hash_many_sse41(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_sse41_supported(void) -{ -#if defined(__x86_64) - return (kfpu_allowed() && zfs_sse4_1_available()); -#elif defined(__PPC64__) && defined(__linux__) - return (kfpu_allowed() && zfs_vsx_available()); -#else - return (kfpu_allowed()); -#endif -} - -const blake3_ops_t blake3_sse41_impl = { - .compress_in_place = blake3_compress_in_place_sse41, - .compress_xof = blake3_compress_xof_sse41, - .hash_many = blake3_hash_many_sse41, - .is_supported = blake3_is_sse41_supported, - .degree = 4, - .name = "sse41" -}; -#endif - -#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) -extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_hash_many_avx2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, - 
increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_avx2_supported(void) -{ - return (kfpu_allowed() && zfs_sse4_1_available() && - zfs_avx2_available()); -} - -const blake3_ops_t blake3_avx2_impl = { - .compress_in_place = blake3_compress_in_place_sse41, - .compress_xof = blake3_compress_xof_sse41, - .hash_many = blake3_hash_many_avx2, - .is_supported = blake3_is_avx2_supported, - .degree = 8, - .name = "avx2" -}; -#endif - -#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) -extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags); - -extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]); - -extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags) { - kfpu_begin(); - zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, - flags); - kfpu_end(); -} - -static void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]) { - kfpu_begin(); - zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, - out); - kfpu_end(); -} - -static void blake3_hash_many_avx512(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_avx512_supported(void) -{ - return (kfpu_allowed() && zfs_avx512f_available() && - zfs_avx512vl_available()); -} - -const blake3_ops_t blake3_avx512_impl = { - .compress_in_place = blake3_compress_in_place_avx512, - .compress_xof = blake3_compress_xof_avx512, - .hash_many = blake3_hash_many_avx512, - .is_supported = blake3_is_avx512_supported, - .degree = 16, - .name = "avx512" -}; -#endif diff --git a/module/icp/algs/sha2/sha2.c b/module/icp/algs/sha2/sha2.c deleted file mode 100644 index e6bbe34eaa57..000000000000 --- a/module/icp/algs/sha2/sha2.c +++ /dev/null @@ -1,957 +0,0 @@ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - */ - -/* - * The basic framework for this code came from the reference - * implementation for MD5. That implementation is Copyright (C) - * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. - * - * License to copy and use this software is granted provided that it - * is identified as the "RSA Data Security, Inc. MD5 Message-Digest - * Algorithm" in all material mentioning or referencing this software - * or this function. - * - * License is also granted to make and use derivative works provided - * that such works are identified as "derived from the RSA Data - * Security, Inc. 
MD5 Message-Digest Algorithm" in all material - * mentioning or referencing the derived work. - * - * RSA Data Security, Inc. makes no representations concerning either - * the merchantability of this software or the suitability of this - * software for any particular purpose. It is provided "as is" - * without express or implied warranty of any kind. - * - * These notices must be retained in any copies of any part of this - * documentation and/or software. - * - * NOTE: Cleaned-up and optimized, version of SHA2, based on the FIPS 180-2 - * standard, available at - * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2.pdf - * Not as fast as one would like -- further optimizations are encouraged - * and appreciated. - */ - -#include -#define _SHA2_IMPL -#include -#include - -#define _RESTRICT_KYWD - -#ifdef _ZFS_LITTLE_ENDIAN -#include -#define HAVE_HTONL -#endif -#include /* for _ILP32 */ -#include - -static void Encode(uint8_t *, uint32_t *, size_t); -static void Encode64(uint8_t *, uint64_t *, size_t); - -/* userspace only supports the generic version */ -#if defined(__amd64) && defined(_KERNEL) -#define SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1) -#define SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1) - -void ASMABI SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); -void ASMABI SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); - -#else -static void SHA256Transform(SHA2_CTX *, const uint8_t *); -static void SHA512Transform(SHA2_CTX *, const uint8_t *); -#endif /* __amd64 && _KERNEL */ - -static const uint8_t PADDING[128] = { 0x80, /* all zeros */ }; - -/* - * The low-level checksum routines use a lot of stack space. On systems where - * small stacks are enforced (like 32-bit kernel builds), insert compiler memory - * barriers to reduce stack frame size. This can reduce the SHA512Transform() - * stack frame usage from 3k to <1k on ARM32, for example. - */ -#if defined(_ILP32) || defined(__powerpc) /* small stack */ -#define SMALL_STACK_MEMORY_BARRIER asm volatile("": : :"memory"); -#else -#define SMALL_STACK_MEMORY_BARRIER -#endif - -/* Ch and Maj are the basic SHA2 functions. */ -#define Ch(b, c, d) (((b) & (c)) ^ ((~b) & (d))) -#define Maj(b, c, d) (((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d))) - -/* Rotates x right n bits. 
*/ -#define ROTR(x, n) \ - (((x) >> (n)) | ((x) << ((sizeof (x) * NBBY)-(n)))) - -/* Shift x right n bits */ -#define SHR(x, n) ((x) >> (n)) - -/* SHA256 Functions */ -#define BIGSIGMA0_256(x) (ROTR((x), 2) ^ ROTR((x), 13) ^ ROTR((x), 22)) -#define BIGSIGMA1_256(x) (ROTR((x), 6) ^ ROTR((x), 11) ^ ROTR((x), 25)) -#define SIGMA0_256(x) (ROTR((x), 7) ^ ROTR((x), 18) ^ SHR((x), 3)) -#define SIGMA1_256(x) (ROTR((x), 17) ^ ROTR((x), 19) ^ SHR((x), 10)) - -#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \ - T1 = h + BIGSIGMA1_256(e) + Ch(e, f, g) + SHA256_CONST(i) + w; \ - d += T1; \ - T2 = BIGSIGMA0_256(a) + Maj(a, b, c); \ - h = T1 + T2 - -/* SHA384/512 Functions */ -#define BIGSIGMA0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39)) -#define BIGSIGMA1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41)) -#define SIGMA0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ SHR((x), 7)) -#define SIGMA1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ SHR((x), 6)) -#define SHA512ROUND(a, b, c, d, e, f, g, h, i, w) \ - T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + SHA512_CONST(i) + w; \ - d += T1; \ - T2 = BIGSIGMA0(a) + Maj(a, b, c); \ - h = T1 + T2; \ - SMALL_STACK_MEMORY_BARRIER; - -/* - * sparc optimization: - * - * on the sparc, we can load big endian 32-bit data easily. note that - * special care must be taken to ensure the address is 32-bit aligned. - * in the interest of speed, we don't check to make sure, since - * careful programming can guarantee this for us. - */ - -#if defined(_ZFS_BIG_ENDIAN) -#define LOAD_BIG_32(addr) (*(uint32_t *)(addr)) -#define LOAD_BIG_64(addr) (*(uint64_t *)(addr)) - -#elif defined(HAVE_HTONL) -#define LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr))) -#define LOAD_BIG_64(addr) htonll(*((uint64_t *)(addr))) - -#else -/* little endian -- will work on big endian, but slowly */ -#define LOAD_BIG_32(addr) \ - (((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3]) -#define LOAD_BIG_64(addr) \ - (((uint64_t)(addr)[0] << 56) | ((uint64_t)(addr)[1] << 48) | \ - ((uint64_t)(addr)[2] << 40) | ((uint64_t)(addr)[3] << 32) | \ - ((uint64_t)(addr)[4] << 24) | ((uint64_t)(addr)[5] << 16) | \ - ((uint64_t)(addr)[6] << 8) | (uint64_t)(addr)[7]) -#endif /* _BIG_ENDIAN */ - - -#if !defined(__amd64) || !defined(_KERNEL) -/* SHA256 Transform */ - -static void -SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk) -{ - uint32_t a = ctx->state.s32[0]; - uint32_t b = ctx->state.s32[1]; - uint32_t c = ctx->state.s32[2]; - uint32_t d = ctx->state.s32[3]; - uint32_t e = ctx->state.s32[4]; - uint32_t f = ctx->state.s32[5]; - uint32_t g = ctx->state.s32[6]; - uint32_t h = ctx->state.s32[7]; - - uint32_t w0, w1, w2, w3, w4, w5, w6, w7; - uint32_t w8, w9, w10, w11, w12, w13, w14, w15; - uint32_t T1, T2; - -#if defined(__sparc) - static const uint32_t sha256_consts[] = { - SHA256_CONST_0, SHA256_CONST_1, SHA256_CONST_2, - SHA256_CONST_3, SHA256_CONST_4, SHA256_CONST_5, - SHA256_CONST_6, SHA256_CONST_7, SHA256_CONST_8, - SHA256_CONST_9, SHA256_CONST_10, SHA256_CONST_11, - SHA256_CONST_12, SHA256_CONST_13, SHA256_CONST_14, - SHA256_CONST_15, SHA256_CONST_16, SHA256_CONST_17, - SHA256_CONST_18, SHA256_CONST_19, SHA256_CONST_20, - SHA256_CONST_21, SHA256_CONST_22, SHA256_CONST_23, - SHA256_CONST_24, SHA256_CONST_25, SHA256_CONST_26, - SHA256_CONST_27, SHA256_CONST_28, SHA256_CONST_29, - SHA256_CONST_30, SHA256_CONST_31, SHA256_CONST_32, - SHA256_CONST_33, SHA256_CONST_34, SHA256_CONST_35, - SHA256_CONST_36, SHA256_CONST_37, SHA256_CONST_38, - SHA256_CONST_39, SHA256_CONST_40, SHA256_CONST_41, - SHA256_CONST_42, 
SHA256_CONST_43, SHA256_CONST_44, - SHA256_CONST_45, SHA256_CONST_46, SHA256_CONST_47, - SHA256_CONST_48, SHA256_CONST_49, SHA256_CONST_50, - SHA256_CONST_51, SHA256_CONST_52, SHA256_CONST_53, - SHA256_CONST_54, SHA256_CONST_55, SHA256_CONST_56, - SHA256_CONST_57, SHA256_CONST_58, SHA256_CONST_59, - SHA256_CONST_60, SHA256_CONST_61, SHA256_CONST_62, - SHA256_CONST_63 - }; -#endif /* __sparc */ - - if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */ - memcpy(ctx->buf_un.buf32, blk, sizeof (ctx->buf_un.buf32)); - blk = (uint8_t *)ctx->buf_un.buf32; - } - - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w0 = LOAD_BIG_32(blk + 4 * 0); - SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w1 = LOAD_BIG_32(blk + 4 * 1); - SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w2 = LOAD_BIG_32(blk + 4 * 2); - SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w3 = LOAD_BIG_32(blk + 4 * 3); - SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w4 = LOAD_BIG_32(blk + 4 * 4); - SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w5 = LOAD_BIG_32(blk + 4 * 5); - SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w6 = LOAD_BIG_32(blk + 4 * 6); - SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w7 = LOAD_BIG_32(blk + 4 * 7); - SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w8 = LOAD_BIG_32(blk + 4 * 8); - SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w9 = LOAD_BIG_32(blk + 4 * 9); - SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w10 = LOAD_BIG_32(blk + 4 * 10); - SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w11 = LOAD_BIG_32(blk + 4 * 11); - SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w12 = LOAD_BIG_32(blk + 4 * 12); - SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w13 = LOAD_BIG_32(blk + 4 * 13); - SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w14 = LOAD_BIG_32(blk + 4 * 14); - SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w15 = LOAD_BIG_32(blk + 4 * 15); - SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15); - - w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0; - SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0); - w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1; - SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1); - w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2; - SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2); - w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3; - SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3); - w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4; - SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4); - w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5; - SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5); - w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6; - SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6); - w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7; - SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7); - w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8; - SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8); - w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9; - SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9); - w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10; - SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10); - w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + 
w11; - SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11); - w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12; - SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12); - w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13; - SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13); - w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14; - SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14); - w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15; - SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15); - - w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0; - SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0); - w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1; - SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1); - w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2; - SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2); - w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3; - SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3); - w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4; - SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4); - w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5; - SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5); - w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6; - SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6); - w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7; - SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7); - w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8; - SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8); - w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9; - SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9); - w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10; - SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10); - w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11; - SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11); - w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12; - SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12); - w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13; - SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13); - w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14; - SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14); - w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15; - SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15); - - w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0; - SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0); - w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1; - SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1); - w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2; - SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2); - w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3; - SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3); - w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4; - SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4); - w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5; - SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5); - w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6; - SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6); - w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7; - SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7); - w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8; - SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8); - w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9; - SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9); - w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10; - SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10); - w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11; - SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11); - w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12; - SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12); - w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13; - SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13); - w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14; - SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14); - w15 = 
SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15; - SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15); - - ctx->state.s32[0] += a; - ctx->state.s32[1] += b; - ctx->state.s32[2] += c; - ctx->state.s32[3] += d; - ctx->state.s32[4] += e; - ctx->state.s32[5] += f; - ctx->state.s32[6] += g; - ctx->state.s32[7] += h; -} - - -/* SHA384 and SHA512 Transform */ - -static void -SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk) -{ - - uint64_t a = ctx->state.s64[0]; - uint64_t b = ctx->state.s64[1]; - uint64_t c = ctx->state.s64[2]; - uint64_t d = ctx->state.s64[3]; - uint64_t e = ctx->state.s64[4]; - uint64_t f = ctx->state.s64[5]; - uint64_t g = ctx->state.s64[6]; - uint64_t h = ctx->state.s64[7]; - - uint64_t w0, w1, w2, w3, w4, w5, w6, w7; - uint64_t w8, w9, w10, w11, w12, w13, w14, w15; - uint64_t T1, T2; - -#if defined(__sparc) - static const uint64_t sha512_consts[] = { - SHA512_CONST_0, SHA512_CONST_1, SHA512_CONST_2, - SHA512_CONST_3, SHA512_CONST_4, SHA512_CONST_5, - SHA512_CONST_6, SHA512_CONST_7, SHA512_CONST_8, - SHA512_CONST_9, SHA512_CONST_10, SHA512_CONST_11, - SHA512_CONST_12, SHA512_CONST_13, SHA512_CONST_14, - SHA512_CONST_15, SHA512_CONST_16, SHA512_CONST_17, - SHA512_CONST_18, SHA512_CONST_19, SHA512_CONST_20, - SHA512_CONST_21, SHA512_CONST_22, SHA512_CONST_23, - SHA512_CONST_24, SHA512_CONST_25, SHA512_CONST_26, - SHA512_CONST_27, SHA512_CONST_28, SHA512_CONST_29, - SHA512_CONST_30, SHA512_CONST_31, SHA512_CONST_32, - SHA512_CONST_33, SHA512_CONST_34, SHA512_CONST_35, - SHA512_CONST_36, SHA512_CONST_37, SHA512_CONST_38, - SHA512_CONST_39, SHA512_CONST_40, SHA512_CONST_41, - SHA512_CONST_42, SHA512_CONST_43, SHA512_CONST_44, - SHA512_CONST_45, SHA512_CONST_46, SHA512_CONST_47, - SHA512_CONST_48, SHA512_CONST_49, SHA512_CONST_50, - SHA512_CONST_51, SHA512_CONST_52, SHA512_CONST_53, - SHA512_CONST_54, SHA512_CONST_55, SHA512_CONST_56, - SHA512_CONST_57, SHA512_CONST_58, SHA512_CONST_59, - SHA512_CONST_60, SHA512_CONST_61, SHA512_CONST_62, - SHA512_CONST_63, SHA512_CONST_64, SHA512_CONST_65, - SHA512_CONST_66, SHA512_CONST_67, SHA512_CONST_68, - SHA512_CONST_69, SHA512_CONST_70, SHA512_CONST_71, - SHA512_CONST_72, SHA512_CONST_73, SHA512_CONST_74, - SHA512_CONST_75, SHA512_CONST_76, SHA512_CONST_77, - SHA512_CONST_78, SHA512_CONST_79 - }; -#endif /* __sparc */ - - - if ((uintptr_t)blk & 0x7) { /* not 8-byte aligned? 
*/ - memcpy(ctx->buf_un.buf64, blk, sizeof (ctx->buf_un.buf64)); - blk = (uint8_t *)ctx->buf_un.buf64; - } - - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w0 = LOAD_BIG_64(blk + 8 * 0); - SHA512ROUND(a, b, c, d, e, f, g, h, 0, w0); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w1 = LOAD_BIG_64(blk + 8 * 1); - SHA512ROUND(h, a, b, c, d, e, f, g, 1, w1); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w2 = LOAD_BIG_64(blk + 8 * 2); - SHA512ROUND(g, h, a, b, c, d, e, f, 2, w2); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w3 = LOAD_BIG_64(blk + 8 * 3); - SHA512ROUND(f, g, h, a, b, c, d, e, 3, w3); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w4 = LOAD_BIG_64(blk + 8 * 4); - SHA512ROUND(e, f, g, h, a, b, c, d, 4, w4); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w5 = LOAD_BIG_64(blk + 8 * 5); - SHA512ROUND(d, e, f, g, h, a, b, c, 5, w5); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w6 = LOAD_BIG_64(blk + 8 * 6); - SHA512ROUND(c, d, e, f, g, h, a, b, 6, w6); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w7 = LOAD_BIG_64(blk + 8 * 7); - SHA512ROUND(b, c, d, e, f, g, h, a, 7, w7); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w8 = LOAD_BIG_64(blk + 8 * 8); - SHA512ROUND(a, b, c, d, e, f, g, h, 8, w8); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w9 = LOAD_BIG_64(blk + 8 * 9); - SHA512ROUND(h, a, b, c, d, e, f, g, 9, w9); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w10 = LOAD_BIG_64(blk + 8 * 10); - SHA512ROUND(g, h, a, b, c, d, e, f, 10, w10); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w11 = LOAD_BIG_64(blk + 8 * 11); - SHA512ROUND(f, g, h, a, b, c, d, e, 11, w11); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w12 = LOAD_BIG_64(blk + 8 * 12); - SHA512ROUND(e, f, g, h, a, b, c, d, 12, w12); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w13 = LOAD_BIG_64(blk + 8 * 13); - SHA512ROUND(d, e, f, g, h, a, b, c, 13, w13); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w14 = LOAD_BIG_64(blk + 8 * 14); - SHA512ROUND(c, d, e, f, g, h, a, b, 14, w14); - /* LINTED E_BAD_PTR_CAST_ALIGN */ - w15 = LOAD_BIG_64(blk + 8 * 15); - SHA512ROUND(b, c, d, e, f, g, h, a, 15, w15); - - w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; - SHA512ROUND(a, b, c, d, e, f, g, h, 16, w0); - w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; - SHA512ROUND(h, a, b, c, d, e, f, g, 17, w1); - w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; - SHA512ROUND(g, h, a, b, c, d, e, f, 18, w2); - w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; - SHA512ROUND(f, g, h, a, b, c, d, e, 19, w3); - w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; - SHA512ROUND(e, f, g, h, a, b, c, d, 20, w4); - w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; - SHA512ROUND(d, e, f, g, h, a, b, c, 21, w5); - w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; - SHA512ROUND(c, d, e, f, g, h, a, b, 22, w6); - w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; - SHA512ROUND(b, c, d, e, f, g, h, a, 23, w7); - w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; - SHA512ROUND(a, b, c, d, e, f, g, h, 24, w8); - w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; - SHA512ROUND(h, a, b, c, d, e, f, g, 25, w9); - w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; - SHA512ROUND(g, h, a, b, c, d, e, f, 26, w10); - w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; - SHA512ROUND(f, g, h, a, b, c, d, e, 27, w11); - w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; - SHA512ROUND(e, f, g, h, a, b, c, d, 28, w12); - w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; - SHA512ROUND(d, e, f, g, h, a, b, c, 29, w13); - w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; - SHA512ROUND(c, d, e, f, g, h, a, b, 30, w14); - w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; - SHA512ROUND(b, c, d, e, f, g, h, a, 31, w15); - - w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; - SHA512ROUND(a, b, c, d, e, f, g, h, 32, w0); - w1 = SIGMA1(w15) + w10 + 
SIGMA0(w2) + w1; - SHA512ROUND(h, a, b, c, d, e, f, g, 33, w1); - w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; - SHA512ROUND(g, h, a, b, c, d, e, f, 34, w2); - w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; - SHA512ROUND(f, g, h, a, b, c, d, e, 35, w3); - w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; - SHA512ROUND(e, f, g, h, a, b, c, d, 36, w4); - w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; - SHA512ROUND(d, e, f, g, h, a, b, c, 37, w5); - w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; - SHA512ROUND(c, d, e, f, g, h, a, b, 38, w6); - w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; - SHA512ROUND(b, c, d, e, f, g, h, a, 39, w7); - w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; - SHA512ROUND(a, b, c, d, e, f, g, h, 40, w8); - w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; - SHA512ROUND(h, a, b, c, d, e, f, g, 41, w9); - w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; - SHA512ROUND(g, h, a, b, c, d, e, f, 42, w10); - w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; - SHA512ROUND(f, g, h, a, b, c, d, e, 43, w11); - w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; - SHA512ROUND(e, f, g, h, a, b, c, d, 44, w12); - w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; - SHA512ROUND(d, e, f, g, h, a, b, c, 45, w13); - w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; - SHA512ROUND(c, d, e, f, g, h, a, b, 46, w14); - w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; - SHA512ROUND(b, c, d, e, f, g, h, a, 47, w15); - - w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; - SHA512ROUND(a, b, c, d, e, f, g, h, 48, w0); - w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; - SHA512ROUND(h, a, b, c, d, e, f, g, 49, w1); - w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; - SHA512ROUND(g, h, a, b, c, d, e, f, 50, w2); - w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; - SHA512ROUND(f, g, h, a, b, c, d, e, 51, w3); - w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; - SHA512ROUND(e, f, g, h, a, b, c, d, 52, w4); - w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; - SHA512ROUND(d, e, f, g, h, a, b, c, 53, w5); - w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; - SHA512ROUND(c, d, e, f, g, h, a, b, 54, w6); - w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; - SHA512ROUND(b, c, d, e, f, g, h, a, 55, w7); - w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; - SHA512ROUND(a, b, c, d, e, f, g, h, 56, w8); - w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; - SHA512ROUND(h, a, b, c, d, e, f, g, 57, w9); - w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; - SHA512ROUND(g, h, a, b, c, d, e, f, 58, w10); - w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; - SHA512ROUND(f, g, h, a, b, c, d, e, 59, w11); - w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; - SHA512ROUND(e, f, g, h, a, b, c, d, 60, w12); - w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; - SHA512ROUND(d, e, f, g, h, a, b, c, 61, w13); - w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; - SHA512ROUND(c, d, e, f, g, h, a, b, 62, w14); - w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; - SHA512ROUND(b, c, d, e, f, g, h, a, 63, w15); - - w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; - SHA512ROUND(a, b, c, d, e, f, g, h, 64, w0); - w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; - SHA512ROUND(h, a, b, c, d, e, f, g, 65, w1); - w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; - SHA512ROUND(g, h, a, b, c, d, e, f, 66, w2); - w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; - SHA512ROUND(f, g, h, a, b, c, d, e, 67, w3); - w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; - SHA512ROUND(e, f, g, h, a, b, c, d, 68, w4); - w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; - SHA512ROUND(d, e, f, g, h, a, b, c, 69, w5); - w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; - SHA512ROUND(c, d, e, f, g, h, a, b, 70, w6); - w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; - SHA512ROUND(b, c, d, e, f, g, h, a, 71, w7); - w8 = SIGMA1(w6) + w1 + 
SIGMA0(w9) + w8; - SHA512ROUND(a, b, c, d, e, f, g, h, 72, w8); - w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; - SHA512ROUND(h, a, b, c, d, e, f, g, 73, w9); - w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; - SHA512ROUND(g, h, a, b, c, d, e, f, 74, w10); - w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; - SHA512ROUND(f, g, h, a, b, c, d, e, 75, w11); - w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; - SHA512ROUND(e, f, g, h, a, b, c, d, 76, w12); - w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; - SHA512ROUND(d, e, f, g, h, a, b, c, 77, w13); - w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; - SHA512ROUND(c, d, e, f, g, h, a, b, 78, w14); - w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; - SHA512ROUND(b, c, d, e, f, g, h, a, 79, w15); - - ctx->state.s64[0] += a; - ctx->state.s64[1] += b; - ctx->state.s64[2] += c; - ctx->state.s64[3] += d; - ctx->state.s64[4] += e; - ctx->state.s64[5] += f; - ctx->state.s64[6] += g; - ctx->state.s64[7] += h; - -} -#endif /* !__amd64 || !_KERNEL */ - - -/* - * Encode() - * - * purpose: to convert a list of numbers from little endian to big endian - * input: uint8_t * : place to store the converted big endian numbers - * uint32_t * : place to get numbers to convert from - * size_t : the length of the input in bytes - * output: void - */ - -static void -Encode(uint8_t *_RESTRICT_KYWD output, uint32_t *_RESTRICT_KYWD input, - size_t len) -{ - size_t i, j; - -#if defined(__sparc) - if (IS_P2ALIGNED(output, sizeof (uint32_t))) { - for (i = 0, j = 0; j < len; i++, j += 4) { - /* LINTED E_BAD_PTR_CAST_ALIGN */ - *((uint32_t *)(output + j)) = input[i]; - } - } else { -#endif /* little endian -- will work on big endian, but slowly */ - for (i = 0, j = 0; j < len; i++, j += 4) { - output[j] = (input[i] >> 24) & 0xff; - output[j + 1] = (input[i] >> 16) & 0xff; - output[j + 2] = (input[i] >> 8) & 0xff; - output[j + 3] = input[i] & 0xff; - } -#if defined(__sparc) - } -#endif -} - -static void -Encode64(uint8_t *_RESTRICT_KYWD output, uint64_t *_RESTRICT_KYWD input, - size_t len) -{ - size_t i, j; - -#if defined(__sparc) - if (IS_P2ALIGNED(output, sizeof (uint64_t))) { - for (i = 0, j = 0; j < len; i++, j += 8) { - /* LINTED E_BAD_PTR_CAST_ALIGN */ - *((uint64_t *)(output + j)) = input[i]; - } - } else { -#endif /* little endian -- will work on big endian, but slowly */ - for (i = 0, j = 0; j < len; i++, j += 8) { - - output[j] = (input[i] >> 56) & 0xff; - output[j + 1] = (input[i] >> 48) & 0xff; - output[j + 2] = (input[i] >> 40) & 0xff; - output[j + 3] = (input[i] >> 32) & 0xff; - output[j + 4] = (input[i] >> 24) & 0xff; - output[j + 5] = (input[i] >> 16) & 0xff; - output[j + 6] = (input[i] >> 8) & 0xff; - output[j + 7] = input[i] & 0xff; - } -#if defined(__sparc) - } -#endif -} - - -void -SHA2Init(uint64_t mech, SHA2_CTX *ctx) -{ - - switch (mech) { - case SHA256_MECH_INFO_TYPE: - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - ctx->state.s32[0] = 0x6a09e667U; - ctx->state.s32[1] = 0xbb67ae85U; - ctx->state.s32[2] = 0x3c6ef372U; - ctx->state.s32[3] = 0xa54ff53aU; - ctx->state.s32[4] = 0x510e527fU; - ctx->state.s32[5] = 0x9b05688cU; - ctx->state.s32[6] = 0x1f83d9abU; - ctx->state.s32[7] = 0x5be0cd19U; - break; - case SHA384_MECH_INFO_TYPE: - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: - ctx->state.s64[0] = 0xcbbb9d5dc1059ed8ULL; - ctx->state.s64[1] = 0x629a292a367cd507ULL; - ctx->state.s64[2] = 0x9159015a3070dd17ULL; - ctx->state.s64[3] = 0x152fecd8f70e5939ULL; - ctx->state.s64[4] = 0x67332667ffc00b31ULL; - ctx->state.s64[5] = 
0x8eb44a8768581511ULL; - ctx->state.s64[6] = 0xdb0c2e0d64f98fa7ULL; - ctx->state.s64[7] = 0x47b5481dbefa4fa4ULL; - break; - case SHA512_MECH_INFO_TYPE: - case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: - ctx->state.s64[0] = 0x6a09e667f3bcc908ULL; - ctx->state.s64[1] = 0xbb67ae8584caa73bULL; - ctx->state.s64[2] = 0x3c6ef372fe94f82bULL; - ctx->state.s64[3] = 0xa54ff53a5f1d36f1ULL; - ctx->state.s64[4] = 0x510e527fade682d1ULL; - ctx->state.s64[5] = 0x9b05688c2b3e6c1fULL; - ctx->state.s64[6] = 0x1f83d9abfb41bd6bULL; - ctx->state.s64[7] = 0x5be0cd19137e2179ULL; - break; - case SHA512_224_MECH_INFO_TYPE: - ctx->state.s64[0] = 0x8C3D37C819544DA2ULL; - ctx->state.s64[1] = 0x73E1996689DCD4D6ULL; - ctx->state.s64[2] = 0x1DFAB7AE32FF9C82ULL; - ctx->state.s64[3] = 0x679DD514582F9FCFULL; - ctx->state.s64[4] = 0x0F6D2B697BD44DA8ULL; - ctx->state.s64[5] = 0x77E36F7304C48942ULL; - ctx->state.s64[6] = 0x3F9D85A86A1D36C8ULL; - ctx->state.s64[7] = 0x1112E6AD91D692A1ULL; - break; - case SHA512_256_MECH_INFO_TYPE: - ctx->state.s64[0] = 0x22312194FC2BF72CULL; - ctx->state.s64[1] = 0x9F555FA3C84C64C2ULL; - ctx->state.s64[2] = 0x2393B86B6F53B151ULL; - ctx->state.s64[3] = 0x963877195940EABDULL; - ctx->state.s64[4] = 0x96283EE2A88EFFE3ULL; - ctx->state.s64[5] = 0xBE5E1E2553863992ULL; - ctx->state.s64[6] = 0x2B0199FC2C85B8AAULL; - ctx->state.s64[7] = 0x0EB72DDC81C52CA2ULL; - break; -#ifdef _KERNEL - default: - cmn_err(CE_PANIC, - "sha2_init: failed to find a supported algorithm: 0x%x", - (uint32_t)mech); - -#endif /* _KERNEL */ - } - - ctx->algotype = (uint32_t)mech; - ctx->count.c64[0] = ctx->count.c64[1] = 0; -} - -#ifndef _KERNEL - -// #pragma inline(SHA256Init, SHA384Init, SHA512Init) -void -SHA256Init(SHA256_CTX *ctx) -{ - SHA2Init(SHA256, ctx); -} - -void -SHA384Init(SHA384_CTX *ctx) -{ - SHA2Init(SHA384, ctx); -} - -void -SHA512Init(SHA512_CTX *ctx) -{ - SHA2Init(SHA512, ctx); -} - -#endif /* _KERNEL */ - -/* - * SHA2Update() - * - * purpose: continues an sha2 digest operation, using the message block - * to update the context. - * input: SHA2_CTX * : the context to update - * void * : the message block - * size_t : the length of the message block, in bytes - * output: void - */ - -void -SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len) -{ - uint32_t i, buf_index, buf_len, buf_limit; - const uint8_t *input = inptr; - uint32_t algotype = ctx->algotype; - - /* check for noop */ - if (input_len == 0) - return; - - if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { - buf_limit = 64; - - /* compute number of bytes mod 64 */ - buf_index = (ctx->count.c32[1] >> 3) & 0x3F; - - /* update number of bits */ - if ((ctx->count.c32[1] += (input_len << 3)) < (input_len << 3)) - ctx->count.c32[0]++; - - ctx->count.c32[0] += (input_len >> 29); - - } else { - buf_limit = 128; - - /* compute number of bytes mod 128 */ - buf_index = (ctx->count.c64[1] >> 3) & 0x7F; - - /* update number of bits */ - if ((ctx->count.c64[1] += (input_len << 3)) < (input_len << 3)) - ctx->count.c64[0]++; - - ctx->count.c64[0] += (input_len >> 29); - } - - buf_len = buf_limit - buf_index; - - /* transform as many times as possible */ - i = 0; - if (input_len >= buf_len) { - - /* - * general optimization: - * - * only do initial memcpy() and SHA2Transform() if - * buf_index != 0. if buf_index == 0, we're just - * wasting our time doing the memcpy() since there - * wasn't any data left over from a previous call to - * SHA2Update(). 
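/*
 * Illustrative sketch, not part of the patch: the 64-bit "bits processed"
 * counter that the (removed) SHA2Update() above keeps as two 32-bit words.
 * The low word gains len*8; a wrap-around means a carry into the high word,
 * which also receives the bits shifted out of the 32-bit product (len >> 29).
 */
static void
example_add_bitcount32(uint32_t count[2] /* [0] = high, [1] = low */,
    size_t len)
{
	uint32_t add = (uint32_t)(len << 3);

	count[1] += add;
	if (count[1] < add)		/* wrapped: carry into the high word */
		count[0]++;
	count[0] += (uint32_t)(len >> 29);
}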
- */ - if (buf_index) { - memcpy(&ctx->buf_un.buf8[buf_index], input, buf_len); - if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) - SHA256Transform(ctx, ctx->buf_un.buf8); - else - SHA512Transform(ctx, ctx->buf_un.buf8); - - i = buf_len; - } - -#if !defined(__amd64) || !defined(_KERNEL) - if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { - for (; i + buf_limit - 1 < input_len; i += buf_limit) { - SHA256Transform(ctx, &input[i]); - } - } else { - for (; i + buf_limit - 1 < input_len; i += buf_limit) { - SHA512Transform(ctx, &input[i]); - } - } - -#else - uint32_t block_count; - if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { - block_count = (input_len - i) >> 6; - if (block_count > 0) { - SHA256TransformBlocks(ctx, &input[i], - block_count); - i += block_count << 6; - } - } else { - block_count = (input_len - i) >> 7; - if (block_count > 0) { - SHA512TransformBlocks(ctx, &input[i], - block_count); - i += block_count << 7; - } - } -#endif /* !__amd64 || !_KERNEL */ - - /* - * general optimization: - * - * if i and input_len are the same, return now instead - * of calling memcpy(), since the memcpy() in this case - * will be an expensive noop. - */ - - if (input_len == i) - return; - - buf_index = 0; - } - - /* buffer remaining input */ - memcpy(&ctx->buf_un.buf8[buf_index], &input[i], input_len - i); -} - - -/* - * SHA2Final() - * - * purpose: ends an sha2 digest operation, finalizing the message digest and - * zeroing the context. - * input: uchar_t * : a buffer to store the digest - * : The function actually uses void* because many - * : callers pass things other than uchar_t here. - * SHA2_CTX * : the context to finalize, save, and zero - * output: void - */ - -void -SHA2Final(void *digest, SHA2_CTX *ctx) -{ - uint8_t bitcount_be[sizeof (ctx->count.c32)]; - uint8_t bitcount_be64[sizeof (ctx->count.c64)]; - uint32_t index; - uint32_t algotype = ctx->algotype; - - if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { - index = (ctx->count.c32[1] >> 3) & 0x3f; - Encode(bitcount_be, ctx->count.c32, sizeof (bitcount_be)); - SHA2Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index); - SHA2Update(ctx, bitcount_be, sizeof (bitcount_be)); - Encode(digest, ctx->state.s32, sizeof (ctx->state.s32)); - } else { - index = (ctx->count.c64[1] >> 3) & 0x7f; - Encode64(bitcount_be64, ctx->count.c64, - sizeof (bitcount_be64)); - SHA2Update(ctx, PADDING, ((index < 112) ? 
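/*
 * Illustrative sketch, not part of the patch: the padding-length arithmetic
 * used by the (removed) SHA2Final() above.  The message is padded with
 * PADDING so that, once the big-endian bit count is appended, the total is a
 * whole number of blocks.  The length field is 8 bytes for the 64-byte
 * (SHA-256) family, hence the 56/120 constants, and 16 bytes for the
 * 128-byte (SHA-384/512) family, hence 112/240.
 */
static size_t
example_pad_len(size_t index, size_t blocksz, size_t lenfield)
{
	size_t target = blocksz - lenfield;	/* 56 or 112 */

	/* If the length field no longer fits, pad into one extra block. */
	return (((index < target) ? target : target + blocksz) - index);
}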
112 : 240) - index); - SHA2Update(ctx, bitcount_be64, sizeof (bitcount_be64)); - if (algotype <= SHA384_HMAC_GEN_MECH_INFO_TYPE) { - ctx->state.s64[6] = ctx->state.s64[7] = 0; - Encode64(digest, ctx->state.s64, - sizeof (uint64_t) * 6); - } else if (algotype == SHA512_224_MECH_INFO_TYPE) { - uint8_t last[sizeof (uint64_t)]; - /* - * Since SHA-512/224 doesn't align well to 64-bit - * boundaries, we must do the encoding in three steps: - * 1) encode the three 64-bit words that fit neatly - * 2) encode the last 64-bit word to a temp buffer - * 3) chop out the lower 32-bits from the temp buffer - * and append them to the digest - */ - Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 3); - Encode64(last, &ctx->state.s64[3], sizeof (uint64_t)); - memcpy((uint8_t *)digest + 24, last, 4); - } else if (algotype == SHA512_256_MECH_INFO_TYPE) { - Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 4); - } else { - Encode64(digest, ctx->state.s64, - sizeof (ctx->state.s64)); - } - } - - /* zeroize sensitive information */ - memset(ctx, 0, sizeof (*ctx)); -} - -#ifdef _KERNEL -EXPORT_SYMBOL(SHA2Init); -EXPORT_SYMBOL(SHA2Update); -EXPORT_SYMBOL(SHA2Final); -#endif diff --git a/module/icp/algs/sha2/sha256_impl.c b/module/icp/algs/sha2/sha256_impl.c new file mode 100644 index 000000000000..024cfb1e45c7 --- /dev/null +++ b/module/icp/algs/sha2/sha256_impl.c @@ -0,0 +1,299 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 Tino Reichardt + */ + +#include +#include +#include +#include + +#include + +#define TF(E, N) \ + extern void E(uint32_t s[8], const void *, size_t); \ + static inline void N(uint32_t s[8], const void *d, size_t b) { \ + kfpu_begin(); E(s, d, b); kfpu_end(); \ +} + +/* some implementation is always okay */ +static inline boolean_t sha2_is_supported(void) +{ + return (B_TRUE); +} + +#if defined(__x86_64) + +extern void zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t); +const sha256_ops_t sha256_x64_impl = { + .is_supported = sha2_is_supported, + .transform = zfs_sha256_transform_x64, + .name = "x64" +}; + +#if defined(HAVE_SSSE3) +static boolean_t sha2_have_ssse3(void) +{ + return (kfpu_allowed() && zfs_ssse3_available()); +} + +TF(zfs_sha256_transform_ssse3, tf_sha256_ssse3); +const sha256_ops_t sha256_ssse3_impl = { + .is_supported = sha2_have_ssse3, + .transform = tf_sha256_ssse3, + .name = "ssse3" +}; +#endif + +#if defined(HAVE_AVX) +static boolean_t sha2_have_avx(void) +{ + return (kfpu_allowed() && zfs_avx_available()); +} + +TF(zfs_sha256_transform_avx, tf_sha256_avx); +const sha256_ops_t sha256_avx_impl = { + .is_supported = sha2_have_avx, + .transform = tf_sha256_avx, + .name = "avx" +}; +#endif + +#if defined(HAVE_AVX2) +static boolean_t sha2_have_avx2(void) +{ + return (kfpu_allowed() && zfs_avx2_available()); +} + +TF(zfs_sha256_transform_avx2, tf_sha256_avx2); +const sha256_ops_t sha256_avx2_impl = { + .is_supported = sha2_have_avx2, + .transform = tf_sha256_avx2, + .name = "avx2" +}; +#endif + +#if defined(HAVE_SSE4_1) +static boolean_t sha2_have_shani(void) +{ + return (kfpu_allowed() && zfs_sse4_1_available() && \ + zfs_shani_available()); +} + +TF(zfs_sha256_transform_shani, tf_sha256_shani); +const sha256_ops_t sha256_shani_impl = { + .is_supported = sha2_have_shani, + .transform = tf_sha256_shani, + .name = "shani" +}; +#endif + +#elif defined(__aarch64__) || defined(__arm__) +static boolean_t sha256_have_neon(void) +{ + return (kfpu_allowed() && zfs_neon_available()); +} + +static boolean_t sha256_have_armv8ce(void) +{ + return (kfpu_allowed() && zfs_sha256_available()); +} + +extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t); +const sha256_ops_t sha256_armv7_impl = { + .is_supported = sha2_is_supported, + .transform = zfs_sha256_block_armv7, + .name = "armv7" +}; + +TF(zfs_sha256_block_neon, tf_sha256_neon); +const sha256_ops_t sha256_neon_impl = { + .is_supported = sha256_have_neon, + .transform = tf_sha256_neon, + .name = "neon" +}; + +TF(zfs_sha256_block_armv8, tf_sha256_armv8ce); +const sha256_ops_t sha256_armv8_impl = { + .is_supported = sha256_have_armv8ce, + .transform = tf_sha256_armv8ce, + .name = "armv8-ce" +}; + +#elif defined(__PPC64__) +static boolean_t sha256_have_vsx(void) +{ + return (kfpu_allowed() && zfs_vsx_available()); +} + +TF(zfs_sha256_ppc, tf_sha256_ppc); +const sha256_ops_t sha256_ppc_impl = { + .is_supported = sha2_is_supported, + .transform = tf_sha256_ppc, + .name = "ppc" +}; + +TF(zfs_sha256_power8, tf_sha256_power8); +const sha256_ops_t sha256_power8_impl = { + .is_supported = sha256_have_vsx, + .transform = tf_sha256_power8, + .name = "power8" +}; +#endif /* __PPC64__ */ + +/* the two generic ones */ +extern const sha256_ops_t 
sha256_generic_impl; + +/* array with all sha256 implementations */ +static const sha256_ops_t *const sha256_impls[] = { + &sha256_generic_impl, +#if defined(__x86_64) + &sha256_x64_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSSE3) + &sha256_ssse3_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX) + &sha256_avx_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX2) + &sha256_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSE4_1) + &sha256_shani_impl, +#endif +#if defined(__aarch64__) || defined(__arm__) + &sha256_armv7_impl, + &sha256_neon_impl, + &sha256_armv8_impl, +#endif +#if defined(__PPC64__) + &sha256_ppc_impl, + &sha256_power8_impl, +#endif /* __PPC64__ */ +}; + +/* use the generic implementation functions */ +#define IMPL_NAME "sha256" +#define IMPL_OPS_T sha256_ops_t +#define IMPL_ARRAY sha256_impls +#define IMPL_GET_OPS sha256_get_ops +#define ZFS_IMPL_OPS zfs_sha256_ops +#include + +#ifdef _KERNEL + +#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") + +#if defined(__linux__) + +static int +sha256_param_get(char *buffer, zfs_kernel_param_t *unused) +{ + const uint32_t impl = IMPL_READ(generic_impl_chosen); + char *fmt; + int cnt = 0; + + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + cnt += sprintf(buffer + cnt, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + cnt += sprintf(buffer + cnt, fmt, "fastest"); + + /* list all supported implementations */ + generic_impl_init(); + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + cnt += sprintf(buffer + cnt, fmt, + generic_supp_impls[i]->name); + } + + return (cnt); +} + +static int +sha256_param_set(const char *val, zfs_kernel_param_t *unused) +{ + (void) unused; + return (generic_impl_setname(val)); +} + +#elif defined(__FreeBSD__) + +#include + +static int +sha256_param(ZFS_MODULE_PARAM_ARGS) +{ + int err; + + generic_impl_init(); + if (req->newptr == NULL) { + const uint32_t impl = IMPL_READ(generic_impl_chosen); + const int init_buflen = 64; + const char *fmt; + struct sbuf *s; + + s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); + + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + (void) sbuf_printf(s, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + (void) sbuf_printf(s, fmt, "fastest"); + + /* list all supported implementations */ + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); + } + + err = sbuf_finish(s); + sbuf_delete(s); + + return (err); + } + + char buf[16]; + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) { + return (err); + } + + return (-generic_impl_setname(buf)); +} +#endif + +#undef IMPL_FMT + +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha256_impl, + sha256_param_set, sha256_param_get, ZMOD_RW, \ + "Select SHA256 implementation."); +#endif + +#undef TF diff --git a/module/icp/algs/sha2/sha2_generic.c b/module/icp/algs/sha2/sha2_generic.c new file mode 100644 index 000000000000..e69dc7771b86 --- /dev/null +++ b/module/icp/algs/sha2/sha2_generic.c @@ -0,0 +1,562 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on public domain code in cppcrypto 0.10. + * Copyright (c) 2022 Tino Reichardt + */ + +#include +#include +#include + +#include + +/* + * On i386, gcc brings this for sha512_generic(): + * error: the frame size of 1040 bytes is larger than 1024 + */ +#if defined(__GNUC__) && defined(_ILP32) +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif + +/* SHA256 */ +static const uint32_t SHA256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((y) & (z)) | (((y) | (z)) & (x))) + +#define rotr32(x, n) (((x) >> n) | ((x) << (32 - n))) +#define sum0(x) (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22)) +#define sum1(x) (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25)) +#define sigma0(x) (rotr32((x), 7) ^ rotr32((x), 18) ^ ((x) >> 3)) +#define sigma1(x) (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10)) + +#define WU(j) (W[j & 15] += sigma1(W[(j + 14) & 15]) \ + + W[(j + 9) & 15] + sigma0(W[(j + 1) & 15])) + +#define COMPRESS(i, j, K) \ + T1 = h + sum1(e) + Ch(e, f, g) + K[i + j] + (i? 
WU(j): W[j]); \ + T2 = sum0(a) + Maj(a, b, c); \ + h = g, g = f, f = e, e = d + T1; \ + d = c, c = b, b = a, a = T1 + T2; + +static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks) +{ + uint64_t blk; + + for (blk = 0; blk < num_blks; blk++) { + uint32_t W[16]; + uint32_t a, b, c, d, e, f, g, h; + uint32_t T1, T2; + int i; + + for (i = 0; i < 16; i++) { + W[i] = BE_32( \ + (((const uint32_t *)(data))[blk * 16 + i])); + } + + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + + for (i = 0; i <= 63; i += 16) { + COMPRESS(i, 0, SHA256_K); + COMPRESS(i, 1, SHA256_K); + COMPRESS(i, 2, SHA256_K); + COMPRESS(i, 3, SHA256_K); + COMPRESS(i, 4, SHA256_K); + COMPRESS(i, 5, SHA256_K); + COMPRESS(i, 6, SHA256_K); + COMPRESS(i, 7, SHA256_K); + COMPRESS(i, 8, SHA256_K); + COMPRESS(i, 9, SHA256_K); + COMPRESS(i, 10, SHA256_K); + COMPRESS(i, 11, SHA256_K); + COMPRESS(i, 12, SHA256_K); + COMPRESS(i, 13, SHA256_K); + COMPRESS(i, 14, SHA256_K); + COMPRESS(i, 15, SHA256_K); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + } +} + +#undef sum0 +#undef sum1 +#undef sigma0 +#undef sigma1 + +#define rotr64(x, n) (((x) >> n) | ((x) << (64 - n))) +#define sum0(x) (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39)) +#define sum1(x) (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41)) +#define sigma0(x) (rotr64((x), 1) ^ rotr64((x), 8) ^ ((x) >> 7)) +#define sigma1(x) (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >> 6)) + +/* SHA512 */ +static const uint64_t SHA512_K[80] = { + 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, + 0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019, + 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242, + 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, + 0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, + 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275, + 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, + 0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725, + 0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc, + 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, + 0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001, + 0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218, + 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, + 0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, + 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc, + 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, + 0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207, + 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba, + 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, + 0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, + 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +}; + +static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks) +{ + uint64_t blk; + + for (blk = 0; blk < num_blks; blk++) { + uint64_t W[16]; + uint64_t a, b, c, d, e, f, g, h; + uint64_t T1, T2; + int i; + + 
for (i = 0; i < 16; i++) { + W[i] = BE_64( \ + (((const uint64_t *)(data))[blk * 16 + i])); + } + + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + + for (i = 0; i <= 79; i += 16) { + COMPRESS(i, 0, SHA512_K); + COMPRESS(i, 1, SHA512_K); + COMPRESS(i, 2, SHA512_K); + COMPRESS(i, 3, SHA512_K); + COMPRESS(i, 4, SHA512_K); + COMPRESS(i, 5, SHA512_K); + COMPRESS(i, 6, SHA512_K); + COMPRESS(i, 7, SHA512_K); + COMPRESS(i, 8, SHA512_K); + COMPRESS(i, 9, SHA512_K); + COMPRESS(i, 10, SHA512_K); + COMPRESS(i, 11, SHA512_K); + COMPRESS(i, 12, SHA512_K); + COMPRESS(i, 13, SHA512_K); + COMPRESS(i, 14, SHA512_K); + COMPRESS(i, 15, SHA512_K); + } + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + } +} + +static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) +{ + uint64_t pos = ctx->count[0]; + uint64_t total = ctx->count[1]; + uint8_t *m = ctx->wbuf; + const sha256_ops_t *ops = ctx->ops; + + if (pos && pos + len >= 64) { + memcpy(m + pos, data, 64 - pos); + ops->transform(ctx->state, m, 1); + len -= 64 - pos; + total += (64 - pos) * 8; + data += 64 - pos; + pos = 0; + } + + if (len >= 64) { + uint32_t blocks = len / 64; + uint32_t bytes = blocks * 64; + ops->transform(ctx->state, data, blocks); + len -= bytes; + total += (bytes) * 8; + data += bytes; + } + memcpy(m + pos, data, len); + + pos += len; + total += len * 8; + ctx->count[0] = pos; + ctx->count[1] = total; +} + +static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) +{ + uint64_t pos = ctx->count[0]; + uint64_t total = ctx->count[1]; + uint8_t *m = ctx->wbuf; + const sha512_ops_t *ops = ctx->ops; + + if (pos && pos + len >= 128) { + memcpy(m + pos, data, 128 - pos); + ops->transform(ctx->state, m, 1); + len -= 128 - pos; + total += (128 - pos) * 8; + data += 128 - pos; + pos = 0; + } + + if (len >= 128) { + uint64_t blocks = len / 128; + uint64_t bytes = blocks * 128; + ops->transform(ctx->state, data, blocks); + len -= bytes; + total += (bytes) * 8; + data += bytes; + } + memcpy(m + pos, data, len); + + pos += len; + total += len * 8; + ctx->count[0] = pos; + ctx->count[1] = total; +} + +static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) +{ + uint64_t mlen, pos = ctx->count[0]; + uint8_t *m = ctx->wbuf; + uint32_t *R = (uint32_t *)result; + const sha256_ops_t *ops = ctx->ops; + + m[pos++] = 0x80; + if (pos > 56) { + memset(m + pos, 0, 64 - pos); + ops->transform(ctx->state, m, 1); + pos = 0; + } + + memset(m + pos, 0, 64 - pos); + mlen = BE_64(ctx->count[1]); + memcpy(m + (64 - 8), &mlen, 64 / 8); + ops->transform(ctx->state, m, 1); + + switch (bits) { + case 224: /* 28 - unused currently /TR */ + R[0] = BE_32(ctx->state[0]); + R[1] = BE_32(ctx->state[1]); + R[2] = BE_32(ctx->state[2]); + R[3] = BE_32(ctx->state[3]); + R[4] = BE_32(ctx->state[4]); + R[5] = BE_32(ctx->state[5]); + R[6] = BE_32(ctx->state[6]); + break; + case 256: /* 32 */ + R[0] = BE_32(ctx->state[0]); + R[1] = BE_32(ctx->state[1]); + R[2] = BE_32(ctx->state[2]); + R[3] = BE_32(ctx->state[3]); + R[4] = BE_32(ctx->state[4]); + R[5] = BE_32(ctx->state[5]); + R[6] = BE_32(ctx->state[6]); + R[7] = BE_32(ctx->state[7]); + break; + } + + memset(ctx, 0, sizeof (*ctx)); +} + +static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits) +{ + uint64_t mlen, pos = ctx->count[0]; + uint8_t *m = ctx->wbuf, *r; + uint64_t *R = (uint64_t *)result; + const 
sha512_ops_t *ops = ctx->ops; + + m[pos++] = 0x80; + if (pos > 112) { + memset(m + pos, 0, 128 - pos); + ops->transform(ctx->state, m, 1); + pos = 0; + } + + memset(m + pos, 0, 128 - pos); + mlen = BE_64(ctx->count[1]); + memcpy(m + (128 - 8), &mlen, 64 / 8); + ops->transform(ctx->state, m, 1); + + switch (bits) { + case 224: /* 28 => 3,5 x 8 */ + r = result + 24; + R[0] = BE_64(ctx->state[0]); + R[1] = BE_64(ctx->state[1]); + R[2] = BE_64(ctx->state[2]); + /* last 4 bytes are special here */ + *r++ = (uint8_t)(ctx->state[3] >> 56); + *r++ = (uint8_t)(ctx->state[3] >> 48); + *r++ = (uint8_t)(ctx->state[3] >> 40); + *r++ = (uint8_t)(ctx->state[3] >> 32); + break; + case 256: /* 32 */ + R[0] = BE_64(ctx->state[0]); + R[1] = BE_64(ctx->state[1]); + R[2] = BE_64(ctx->state[2]); + R[3] = BE_64(ctx->state[3]); + break; + case 384: /* 48 */ + R[0] = BE_64(ctx->state[0]); + R[1] = BE_64(ctx->state[1]); + R[2] = BE_64(ctx->state[2]); + R[3] = BE_64(ctx->state[3]); + R[4] = BE_64(ctx->state[4]); + R[5] = BE_64(ctx->state[5]); + break; + case 512: /* 64 */ + R[0] = BE_64(ctx->state[0]); + R[1] = BE_64(ctx->state[1]); + R[2] = BE_64(ctx->state[2]); + R[3] = BE_64(ctx->state[3]); + R[4] = BE_64(ctx->state[4]); + R[5] = BE_64(ctx->state[5]); + R[6] = BE_64(ctx->state[6]); + R[7] = BE_64(ctx->state[7]); + break; + } + + memset(ctx, 0, sizeof (*ctx)); +} + +/* SHA2 Init function */ +void +SHA2Init(int algotype, SHA2_CTX *ctx) +{ + sha256_ctx *ctx256 = &ctx->sha256; + sha512_ctx *ctx512 = &ctx->sha512; + + ASSERT3U(algotype, >=, SHA256_MECH_INFO_TYPE); + ASSERT3U(algotype, <=, SHA512_256_MECH_INFO_TYPE); + + memset(ctx, 0, sizeof (*ctx)); + ctx->algotype = algotype; + switch (ctx->algotype) { + case SHA256_MECH_INFO_TYPE: + case SHA256_HMAC_MECH_INFO_TYPE: + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + ctx256->state[0] = 0x6a09e667; + ctx256->state[1] = 0xbb67ae85; + ctx256->state[2] = 0x3c6ef372; + ctx256->state[3] = 0xa54ff53a; + ctx256->state[4] = 0x510e527f; + ctx256->state[5] = 0x9b05688c; + ctx256->state[6] = 0x1f83d9ab; + ctx256->state[7] = 0x5be0cd19; + ctx256->count[0] = 0; + ctx256->ops = sha256_get_ops(); + break; + case SHA384_MECH_INFO_TYPE: + case SHA384_HMAC_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + ctx512->state[0] = 0xcbbb9d5dc1059ed8ULL; + ctx512->state[1] = 0x629a292a367cd507ULL; + ctx512->state[2] = 0x9159015a3070dd17ULL; + ctx512->state[3] = 0x152fecd8f70e5939ULL; + ctx512->state[4] = 0x67332667ffc00b31ULL; + ctx512->state[5] = 0x8eb44a8768581511ULL; + ctx512->state[6] = 0xdb0c2e0d64f98fa7ULL; + ctx512->state[7] = 0x47b5481dbefa4fa4ULL; + ctx512->count[0] = 0; + ctx512->count[1] = 0; + ctx512->ops = sha512_get_ops(); + break; + case SHA512_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + ctx512->state[0] = 0x6a09e667f3bcc908ULL; + ctx512->state[1] = 0xbb67ae8584caa73bULL; + ctx512->state[2] = 0x3c6ef372fe94f82bULL; + ctx512->state[3] = 0xa54ff53a5f1d36f1ULL; + ctx512->state[4] = 0x510e527fade682d1ULL; + ctx512->state[5] = 0x9b05688c2b3e6c1fULL; + ctx512->state[6] = 0x1f83d9abfb41bd6bULL; + ctx512->state[7] = 0x5be0cd19137e2179ULL; + ctx512->count[0] = 0; + ctx512->count[1] = 0; + ctx512->ops = sha512_get_ops(); + break; + case SHA512_224_MECH_INFO_TYPE: + ctx512->state[0] = 0x8c3d37c819544da2ULL; + ctx512->state[1] = 0x73e1996689dcd4d6ULL; + ctx512->state[2] = 0x1dfab7ae32ff9c82ULL; + ctx512->state[3] = 0x679dd514582f9fcfULL; + ctx512->state[4] = 0x0f6d2b697bd44da8ULL; + ctx512->state[5] = 0x77e36f7304c48942ULL; + ctx512->state[6] = 
0x3f9d85a86a1d36c8ULL; + ctx512->state[7] = 0x1112e6ad91d692a1ULL; + ctx512->count[0] = 0; + ctx512->count[1] = 0; + ctx512->ops = sha512_get_ops(); + break; + case SHA512_256_MECH_INFO_TYPE: + ctx512->state[0] = 0x22312194fc2bf72cULL; + ctx512->state[1] = 0x9f555fa3c84c64c2ULL; + ctx512->state[2] = 0x2393b86b6f53b151ULL; + ctx512->state[3] = 0x963877195940eabdULL; + ctx512->state[4] = 0x96283ee2a88effe3ULL; + ctx512->state[5] = 0xbe5e1e2553863992ULL; + ctx512->state[6] = 0x2b0199fc2c85b8aaULL; + ctx512->state[7] = 0x0eb72ddc81c52ca2ULL; + ctx512->count[0] = 0; + ctx512->count[1] = 0; + ctx512->ops = sha512_get_ops(); + break; + } +} + +/* SHA2 Update function */ +void +SHA2Update(SHA2_CTX *ctx, const void *data, size_t len) +{ + /* check for zero input length */ + if (len == 0) + return; + + ASSERT3P(data, !=, NULL); + + switch (ctx->algotype) { + case SHA256_MECH_INFO_TYPE: + case SHA256_HMAC_MECH_INFO_TYPE: + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + sha256_update(&ctx->sha256, data, len); + break; + case SHA384_MECH_INFO_TYPE: + case SHA384_HMAC_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + sha512_update(&ctx->sha512, data, len); + break; + case SHA512_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + sha512_update(&ctx->sha512, data, len); + break; + case SHA512_224_MECH_INFO_TYPE: + sha512_update(&ctx->sha512, data, len); + break; + case SHA512_256_MECH_INFO_TYPE: + sha512_update(&ctx->sha512, data, len); + break; + } +} + +/* SHA2Final function */ +void +SHA2Final(void *digest, SHA2_CTX *ctx) +{ + switch (ctx->algotype) { + case SHA256_MECH_INFO_TYPE: + case SHA256_HMAC_MECH_INFO_TYPE: + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + sha256_final(&ctx->sha256, digest, 256); + break; + case SHA384_MECH_INFO_TYPE: + case SHA384_HMAC_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + sha512_final(&ctx->sha512, digest, 384); + break; + case SHA512_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + sha512_final(&ctx->sha512, digest, 512); + break; + case SHA512_224_MECH_INFO_TYPE: + sha512_final(&ctx->sha512, digest, 224); + break; + case SHA512_256_MECH_INFO_TYPE: + sha512_final(&ctx->sha512, digest, 256); + break; + } +} + +/* the generic implementation is always okay */ +static boolean_t sha2_is_supported(void) +{ + return (B_TRUE); +} + +const sha256_ops_t sha256_generic_impl = { + .name = "generic", + .transform = sha256_generic, + .is_supported = sha2_is_supported +}; + +const sha512_ops_t sha512_generic_impl = { + .name = "generic", + .transform = sha512_generic, + .is_supported = sha2_is_supported +}; diff --git a/module/icp/algs/sha2/sha512_impl.c b/module/icp/algs/sha2/sha512_impl.c new file mode 100644 index 000000000000..d21312336f8d --- /dev/null +++ b/module/icp/algs/sha2/sha512_impl.c @@ -0,0 +1,276 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 Tino Reichardt + */ + +#include +#include +#include +#include + +#include + +#define TF(E, N) \ + extern void E(uint64_t s[8], const void *, size_t); \ + static inline void N(uint64_t s[8], const void *d, size_t b) { \ + kfpu_begin(); E(s, d, b); kfpu_end(); \ +} + +/* some implementation is always okay */ +static inline boolean_t sha2_is_supported(void) +{ + return (B_TRUE); +} + +#if defined(__x86_64) + +extern void zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t); +const sha512_ops_t sha512_x64_impl = { + .is_supported = sha2_is_supported, + .transform = zfs_sha512_transform_x64, + .name = "x64" +}; + +#if defined(HAVE_AVX) +static boolean_t sha2_have_avx(void) +{ + return (kfpu_allowed() && zfs_avx_available()); +} + +TF(zfs_sha512_transform_avx, tf_sha512_avx); +const sha512_ops_t sha512_avx_impl = { + .is_supported = sha2_have_avx, + .transform = tf_sha512_avx, + .name = "avx" +}; +#endif + +#if defined(HAVE_AVX2) +static boolean_t sha2_have_avx2(void) +{ + return (kfpu_allowed() && zfs_avx2_available()); +} + +TF(zfs_sha512_transform_avx2, tf_sha512_avx2); +const sha512_ops_t sha512_avx2_impl = { + .is_supported = sha2_have_avx2, + .transform = tf_sha512_avx2, + .name = "avx2" +}; +#endif + +#elif defined(__aarch64__) +extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t); +const sha512_ops_t sha512_armv7_impl = { + .is_supported = sha2_is_supported, + .transform = zfs_sha512_block_armv7, + .name = "armv7" +}; + +static boolean_t sha512_have_armv8ce(void) +{ + return (kfpu_allowed() && zfs_sha512_available()); +} + +TF(zfs_sha512_block_armv8, tf_sha512_armv8ce); +const sha512_ops_t sha512_armv8_impl = { + .is_supported = sha512_have_armv8ce, + .transform = tf_sha512_armv8ce, + .name = "armv8-ce" +}; + +#elif defined(__arm__) +extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t); +const sha512_ops_t sha512_armv7_impl = { + .is_supported = sha2_is_supported, + .transform = zfs_sha512_block_armv7, + .name = "armv7" +}; + +static boolean_t sha512_have_neon(void) +{ + return (kfpu_allowed() && zfs_neon_available()); +} + +TF(zfs_sha512_block_neon, tf_sha512_neon); +const sha512_ops_t sha512_neon_impl = { + .is_supported = sha512_have_neon, + .transform = tf_sha512_neon, + .name = "neon" +}; + +#elif defined(__PPC64__) +TF(zfs_sha512_ppc, tf_sha512_ppc); +const sha512_ops_t sha512_ppc_impl = { + .is_supported = sha2_is_supported, + .transform = tf_sha512_ppc, + .name = "ppc" +}; + +static boolean_t sha512_have_vsx(void) +{ + return (kfpu_allowed() && zfs_vsx_available()); +} + +TF(zfs_sha512_power8, tf_sha512_power8); +const sha512_ops_t sha512_power8_impl = { + .is_supported = sha512_have_vsx, + .transform = tf_sha512_power8, + .name = "power8" +}; +#endif /* __PPC64__ */ + +/* the two generic ones */ +extern const sha512_ops_t sha512_generic_impl; + +/* array with all sha512 implementations */ +static const sha512_ops_t *const sha512_impls[] = { + &sha512_generic_impl, +#if defined(__x86_64) + &sha512_x64_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX) + &sha512_avx_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX2) + &sha512_avx2_impl, +#endif +#if defined(__aarch64__) + &sha512_armv7_impl, + &sha512_armv8_impl, +#endif +#if defined(__arm__) + 
&sha512_armv7_impl, + &sha512_neon_impl, +#endif +#if defined(__PPC64__) + &sha512_ppc_impl, + &sha512_power8_impl, +#endif /* __PPC64__ */ +}; + +/* use the generic implementation functions */ +#define IMPL_NAME "sha512" +#define IMPL_OPS_T sha512_ops_t +#define IMPL_ARRAY sha512_impls +#define IMPL_GET_OPS sha512_get_ops +#define ZFS_IMPL_OPS zfs_sha512_ops +#include + +#ifdef _KERNEL + +#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") + +#if defined(__linux__) + +static int +sha512_param_get(char *buffer, zfs_kernel_param_t *unused) +{ + const uint32_t impl = IMPL_READ(generic_impl_chosen); + char *fmt; + int cnt = 0; + + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + cnt += sprintf(buffer + cnt, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + cnt += sprintf(buffer + cnt, fmt, "fastest"); + + /* list all supported implementations */ + generic_impl_init(); + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + cnt += sprintf(buffer + cnt, fmt, + generic_supp_impls[i]->name); + } + + return (cnt); +} + +static int +sha512_param_set(const char *val, zfs_kernel_param_t *unused) +{ + (void) unused; + return (generic_impl_setname(val)); +} + +#elif defined(__FreeBSD__) + +#include + +static int +sha512_param(ZFS_MODULE_PARAM_ARGS) +{ + int err; + + generic_impl_init(); + if (req->newptr == NULL) { + const uint32_t impl = IMPL_READ(generic_impl_chosen); + const int init_buflen = 64; + const char *fmt; + struct sbuf *s; + + s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); + + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + (void) sbuf_printf(s, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + (void) sbuf_printf(s, fmt, "fastest"); + + /* list all supported implementations */ + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); + } + + err = sbuf_finish(s); + sbuf_delete(s); + + return (err); + } + + /* we got module parameter */ + char buf[16]; + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) { + return (err); + } + + return (-generic_impl_setname(buf)); +} +#endif + +#undef IMPL_FMT + +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha512_impl, + sha512_param_set, sha512_param_get, ZMOD_RW, \ + "Select SHA512 implementation."); +#endif + +#undef TF diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S new file mode 100644 index 000000000000..fa50c4e74d59 --- /dev/null +++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S @@ -0,0 +1,1999 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if defined(__aarch64__) + +.text + +.align 6 +.type .LK256,%object +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +.size .LK256,.-.LK256 + +.globl zfs_sha256_block_armv7 +.type zfs_sha256_block_armv7,%function +.align 6 +zfs_sha256_block_armv7: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adr x30,.LK256 + stp x0,x2,[x29,#96] + +.Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // 
Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr 
w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next 
round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +.Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // 
a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // 
(b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 
// Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add 
w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size zfs_sha256_block_armv7,.-zfs_sha256_block_armv7 + +.globl zfs_sha256_block_armv8 +.type zfs_sha256_block_armv8,%function +.align 6 +zfs_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
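+	// Descriptive note (not in the upstream commit): the .Loop_hw body
+	// below uses the ARMv8 Cryptography Extensions (sha256h, sha256h2,
+	// sha256su0, sha256su1), emitted as raw .inst words so the file
+	// assembles even when the assembler lacks crypto support. At runtime
+	// this routine is only selected when sha256_have_armv8ce() — i.e.
+	// kfpu_allowed() && zfs_sha256_available() — reports the extension.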
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b-v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 
0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size zfs_sha256_block_armv8,.-zfs_sha256_block_armv8 + +.globl zfs_sha256_block_neon +.type zfs_sha256_block_neon,%function +.align 4 +zfs_sha256_block_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s-v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s-v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 
{v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr 
v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror 
w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror 
w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size zfs_sha256_block_neon,.-zfs_sha256_block_neon + +#endif diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S new file mode 100644 index 000000000000..1683fc1ca53c --- /dev/null +++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S @@ -0,0 
+1,1558 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if defined(__aarch64__) + +.text + +.align 6 +.type .LK512,%object +.LK512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +.size .LK512,.-.LK512 + +.globl zfs_sha512_block_armv7 +.type zfs_sha512_block_armv7,%function +.align 6 +zfs_sha512_block_armv7: + stp x29,x30,[sp,#-128]! 
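+ // Scalar SHA-512 usable on any AArch64 CPU: the eight 64-bit state words
+ // a..h live in x20-x27, x30 walks the .LK512 round-constant table above,
+ // and the zero quad at the end of that table terminates .Loop_16_xx.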
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adr x30,.LK512 + stp x0,x2,[x29,#96] + +.Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add 
x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor 
x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // 
h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +.Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor 
x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + 
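+ // Message schedule: sigma0(x) = (x ror 1) ^ (x ror 8) ^ (x lsr 7) and
+ // sigma1(x) = (x ror 19) ^ (x ror 61) ^ (x lsr 6); each round folds
+ // sigma0(X[i+1]) and sigma1(X[i+14]) back into the 16-word window, which
+ // is rotated through general registers with four words parked in the
+ // stack slots at [sp,#0..#24].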
ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + 
add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr 
x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size zfs_sha512_block_armv7,.-zfs_sha512_block_armv7 + + +.globl zfs_sha512_block_armv8 +.type zfs_sha512_block_armv8,%function +.align 6 +zfs_sha512_block_armv8: +.Lv8_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b-v19.16b},[x1],#64 // load input + ld1 {v20.16b-v23.16b},[x1],#64 + + ld1 {v0.2d-v3.2d},[x0] // load context + adr x3,.LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b .Loop_hw + +.align 4 +.Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 + .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" + .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 + .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" + .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 + .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" + .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 
{v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 + .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" + .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 + .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" + .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 + .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" + .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 + .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" + .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 + .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" + .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 + .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" + .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 + .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" + .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 + .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + .inst 0xce678a32 //sha512su1 
v18.16b,v17.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" + .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 + .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" + .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 + .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" + .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 + .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" + .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 + .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" + .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 + .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" + .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 + .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" + .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 + .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" + .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec08272 
//sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 + .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" + .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 + .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" + .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 + .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" + .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 + .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" + .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 + .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" + .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 + .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" + .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 + .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" + .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 + .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" + .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext 
v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 + .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" + .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 + .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" + .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 + .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" + .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 + .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" + .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 + .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" + .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 + .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" + .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" + .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" + .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + 
ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" + .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" + .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" + .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" + .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" + .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" + .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" + .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" + .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,.Loop_hw + + st1 {v0.2d-v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret +.size zfs_sha512_block_armv8,.-zfs_sha512_block_armv8 +#endif diff --git a/module/icp/asm-arm/sha2/sha256-armv7.S b/module/icp/asm-arm/sha2/sha256-armv7.S new file mode 100644 index 000000000000..0001e4d69055 --- /dev/null +++ b/module/icp/asm-arm/sha2/sha256-armv7.S @@ -0,0 +1,2769 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if defined(__arm__) + +#define __ARM_ARCH__ 7 +#define __ARM_MAX_ARCH__ 7 + +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.text + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator + +.align 5 +.globl zfs_sha256_block_armv7 +.type zfs_sha256_block_armv7,%function +zfs_sha256_block_armv7: +.Lzfs_sha256_block_armv7: + +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ zfs_sha256_block_armv7 +#else + adr r3,.Lzfs_sha256_block_armv7 +#endif + + add r2,r1,r2,lsl#6 @ len to point at the end of inp + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r14,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 0<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 1<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 2<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 3<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 4<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 5<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 6<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 7<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 8<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 9<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 10<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 11<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 12<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 13<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 14<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 15<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +.Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 16<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 17<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 18<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 19<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 20<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 21<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 22<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 23<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 24<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 25<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 26<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 27<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 28<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 29<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 30<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 31<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#ifdef __thumb2__ + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size zfs_sha256_block_armv7,.-zfs_sha256_block_armv7 + +.arch armv7-a +.fpu neon + +.globl zfs_sha256_block_neon +.type zfs_sha256_block_neon,%function +.align 5 +.skip 16 +zfs_sha256_block_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + sub r11,sp,#16*4+16 + adr r14,K256 + bic r11,r11,#15 @ align for 128-bit stores + mov r12,sp + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! + vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! 
+ + ldmia r0,{r4-r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b .L_00_48 + +.align 4 +.L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! 
+ add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne .L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + it eq + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + it ne + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! 
+ ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8-r11} + + ittte ne + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne r3,r5,r6 + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size zfs_sha256_block_neon,.-zfs_sha256_block_neon + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.globl zfs_sha256_block_armv8 +.type zfs_sha256_block_armv8,%function +.align 5 +zfs_sha256_block_armv8: +.LARMv8: + vld1.32 {q0,q1},[r0] + sub r3,r3,#256+32 + add r2,r1,r2,lsl#6 @ len to point at the end of inp + b .Loop_v8 + +.align 4 +.Loop_v8: + vld1.8 {q8-q9},[r1]! + vld1.8 {q10-q11},[r1]! + vld1.32 {q12},[r3]! + vrev32.8 q8,q8 + vrev32.8 q9,q9 + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vmov q14,q0 @ offload + vmov q15,q1 + teq r1,r2 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! 
+ vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vld1.32 {q13},[r3] + vadd.i32 q12,q12,q10 + sub r3,r3,#256-16 @ rewind + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vadd.i32 q13,q13,q11 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vadd.i32 q0,q0,q14 + vadd.i32 q1,q1,q15 + it ne + bne .Loop_v8 + + vst1.32 {q0,q1},[r0] + + bx lr @ bx lr +.size zfs_sha256_block_armv8,.-zfs_sha256_block_armv8 + +#endif diff --git a/module/icp/asm-arm/sha2/sha512-armv7.S b/module/icp/asm-arm/sha2/sha512-armv7.S new file mode 100644 index 000000000000..a4c804033b92 --- /dev/null +++ b/module/icp/asm-arm/sha2/sha512-armv7.S @@ -0,0 +1,1822 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if defined(__arm__) + +#define __ARM_ARCH__ 7 +#define __ARM_MAX_ARCH__ 7 + +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + +.text + +.type K512,%object +.align 5 +K512: + WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) + WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) + WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) + WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) + WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) + WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) + WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) + WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) + WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) + WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) + WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) + WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) + WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) + WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) + WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) + WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) + WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) + WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) + WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) + WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) + WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) + WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) + WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) + WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) + WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) + WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) + WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) + WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) + WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) + WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) + WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) + WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) + WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) + WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) + WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) + WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) + WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) + WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) + WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) + WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 +.word 0 @ terminator + +.align 5 +.globl zfs_sha512_block_armv7 +.type zfs_sha512_block_armv7,%function +zfs_sha512_block_armv7: +.Lzfs_sha512_block_armv7: + +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r3,pc,#8 @ zfs_sha512_block_armv7 +#else + adr r3,.Lzfs_sha512_block_armv7 +#endif + + add r2,r1,r2,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + sub r14,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr r7,[r0,#32+LO] + ldr r8,[r0,#32+HI] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] +.Loop: + str r9, [sp,#48+0] + str r10, [sp,#48+4] + str 
r11, [sp,#56+0] + str r12, [sp,#56+4] + ldr r5,[r0,#0+LO] + ldr r6,[r0,#0+HI] + ldr r3,[r0,#8+LO] + ldr r4,[r0,#8+HI] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + str r3,[sp,#8+0] + str r4,[sp,#8+4] + str r9, [sp,#16+0] + str r10, [sp,#16+4] + str r11, [sp,#24+0] + str r12, [sp,#24+4] + ldr r3,[r0,#40+LO] + ldr r4,[r0,#40+HI] + str r3,[sp,#40+0] + str r4,[sp,#40+4] + +.L00_15: +#if __ARM_ARCH__<7 + ldrb r3,[r1,#7] + ldrb r9, [r1,#6] + ldrb r10, [r1,#5] + ldrb r11, [r1,#4] + ldrb r4,[r1,#3] + ldrb r12, [r1,#2] + orr r3,r3,r9,lsl#8 + ldrb r9, [r1,#1] + orr r3,r3,r10,lsl#16 + ldrb r10, [r1],#8 + orr r3,r3,r11,lsl#24 + orr r4,r4,r12,lsl#8 + orr r4,r4,r9,lsl#16 + orr r4,r4,r10,lsl#24 +#else + ldr r3,[r1,#4] + ldr r4,[r1],#8 +#ifdef __ARMEL__ + rev r3,r3 + rev r4,r4 +#endif +#endif + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#148 + + ldr r12,[sp,#16+0] @ c.lo +#ifdef __thumb2__ + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + tst r14,#1 + beq .L00_15 + ldr r9,[sp,#184+0] + ldr r10,[sp,#184+4] + bic r14,r14,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov r3,r9,lsr#1 + ldr r11,[sp,#80+0] + mov r4,r10,lsr#1 + ldr r12,[sp,#80+4] + eor r3,r3,r10,lsl#31 + eor r4,r4,r9,lsl#31 + eor r3,r3,r9,lsr#8 + eor r4,r4,r10,lsr#8 + eor r3,r3,r10,lsl#24 + eor r4,r4,r9,lsl#24 + eor r3,r3,r9,lsr#7 + eor r4,r4,r10,lsr#7 + eor r3,r3,r10,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ 
((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov r9,r11,lsr#19 + mov r10,r12,lsr#19 + eor r9,r9,r12,lsl#13 + eor r10,r10,r11,lsl#13 + eor r9,r9,r12,lsr#29 + eor r10,r10,r11,lsr#29 + eor r9,r9,r11,lsl#3 + eor r10,r10,r12,lsl#3 + eor r9,r9,r11,lsr#6 + eor r10,r10,r12,lsr#6 + ldr r11,[sp,#120+0] + eor r9,r9,r12,lsl#26 + + ldr r12,[sp,#120+4] + adds r3,r3,r9 + ldr r9,[sp,#192+0] + adc r4,r4,r10 + + ldr r10,[sp,#192+4] + adds r3,r3,r11 + adc r4,r4,r12 + adds r3,r3,r9 + adc r4,r4,r10 + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#23 + + ldr r12,[sp,#16+0] @ c.lo +#ifdef __thumb2__ + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 +#ifdef __thumb2__ + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r9,[sp,#184+0] + ldreq r10,[sp,#184+4] + beq .L16_79 + bic r14,r14,#1 + + ldr r3,[sp,#8+0] + ldr r4,[sp,#8+4] + ldr r9, [r0,#0+LO] + ldr r10, [r0,#0+HI] + ldr r11, [r0,#8+LO] + ldr r12, [r0,#8+HI] + adds r9,r5,r9 + str r9, [r0,#0+LO] + adc r10,r6,r10 + str r10, [r0,#0+HI] + adds r11,r3,r11 + str r11, [r0,#8+LO] + adc r12,r4,r12 + str r12, [r0,#8+HI] + + ldr r5,[sp,#16+0] + ldr r6,[sp,#16+4] + ldr r3,[sp,#24+0] + ldr r4,[sp,#24+4] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + adds r9,r5,r9 + str r9, [r0,#16+LO] + adc r10,r6,r10 + str r10, [r0,#16+HI] + adds r11,r3,r11 + str r11, [r0,#24+LO] + adc r12,r4,r12 + str r12, [r0,#24+HI] + + ldr r3,[sp,#40+0] + ldr r4,[sp,#40+4] + ldr r9, [r0,#32+LO] + ldr r10, [r0,#32+HI] + ldr r11, 
[r0,#40+LO] + ldr r12, [r0,#40+HI] + adds r7,r7,r9 + str r7,[r0,#32+LO] + adc r8,r8,r10 + str r8,[r0,#32+HI] + adds r11,r3,r11 + str r11, [r0,#40+LO] + adc r12,r4,r12 + str r12, [r0,#40+HI] + + ldr r5,[sp,#48+0] + ldr r6,[sp,#48+4] + ldr r3,[sp,#56+0] + ldr r4,[sp,#56+4] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] + adds r9,r5,r9 + str r9, [r0,#48+LO] + adc r10,r6,r10 + str r10, [r0,#48+HI] + adds r11,r3,r11 + str r11, [r0,#56+LO] + adc r12,r4,r12 + str r12, [r0,#56+HI] + + add sp,sp,#640 + sub r14,r14,#640 + + teq r1,r2 + bne .Loop + + add sp,sp,#8*9 @ destroy frame + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size zfs_sha512_block_armv7,.-zfs_sha512_block_armv7 + +.arch armv7-a +.fpu neon + +.globl zfs_sha512_block_neon +.type zfs_sha512_block_neon,%function +.align 4 +zfs_sha512_block_neon: +.LNEON: + dmb @ errata #451034 on early Cortex A8 + add r2,r1,r2,lsl#7 @ len to point at the end of inp + adr r3,K512 + VFP_ABI_PUSH + vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context +.Loop_neon: + vshr.u64 d24,d20,#14 @ 0 +#if 0<16 + vld1.64 {d0},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 0>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 0<16 && defined(__ARMEL__) + vrev64.8 d0,d0 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 1 +#if 1<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 1>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 1<16 && defined(__ARMEL__) + vrev64.8 d1,d1 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 2 +#if 2<16 + vld1.64 {d2},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 2>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 2<16 && defined(__ARMEL__) + vrev64.8 d2,d2 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 3 +#if 3<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 3>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 3<16 && defined(__ARMEL__) + vrev64.8 d3,d3 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 4 +#if 4<16 + vld1.64 {d4},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 4>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 4<16 && defined(__ARMEL__) + vrev64.8 d4,d4 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 5 +#if 5<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 5>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 5<16 && defined(__ARMEL__) + vrev64.8 d5,d5 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 6 +#if 6<16 + vld1.64 {d6},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 6>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 6<16 && defined(__ARMEL__) + vrev64.8 d6,d6 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 7 +#if 7<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 7>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 7<16 && defined(__ARMEL__) + vrev64.8 d7,d7 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 d24,d20,#14 @ 8 +#if 8<16 + vld1.64 {d8},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 8>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 8<16 && defined(__ARMEL__) + vrev64.8 d8,d8 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 9 +#if 9<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 9>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 9<16 && defined(__ARMEL__) + vrev64.8 d9,d9 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 10 +#if 10<16 + vld1.64 {d10},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 10>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 10<16 && defined(__ARMEL__) + vrev64.8 d10,d10 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 11 +#if 11<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 11>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 11<16 && defined(__ARMEL__) + vrev64.8 d11,d11 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 12 +#if 12<16 + vld1.64 {d12},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 12>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 12<16 && defined(__ARMEL__) + vrev64.8 d12,d12 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 13 +#if 13<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 13>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 13<16 && defined(__ARMEL__) + vrev64.8 d13,d13 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 14 +#if 14<16 + vld1.64 {d14},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 14>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 14<16 && defined(__ARMEL__) + vrev64.8 d14,d14 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 15 +#if 15<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 15>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 15<16 && defined(__ARMEL__) + vrev64.8 d15,d15 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + mov r12,#4 +.L16_79_neon: + subs r12,#1 + vshr.u64 q12,q7,#19 + vshr.u64 q13,q7,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q7,#6 + vsli.64 q12,q7,#45 + vext.8 q14,q0,q1,#8 @ X[i+1] + vsli.64 q13,q7,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q0,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q4,q5,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q0,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q0,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 16<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 17 +#if 17<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 17>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 17<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q0,#19 + vshr.u64 q13,q0,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q0,#6 + vsli.64 q12,q0,#45 + vext.8 q14,q1,q2,#8 @ X[i+1] + vsli.64 q13,q0,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q1,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q5,q6,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q1,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q1,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 18<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 19 +#if 19<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 19>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 19<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q1,#19 + vshr.u64 q13,q1,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q1,#6 + vsli.64 q12,q1,#45 + vext.8 q14,q2,q3,#8 @ X[i+1] + vsli.64 q13,q1,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q2,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q6,q7,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q2,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q2,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 20<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 21 +#if 21<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 21>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 21<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q2,#19 + vshr.u64 q13,q2,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q2,#6 + vsli.64 q12,q2,#45 + vext.8 q14,q3,q4,#8 @ X[i+1] + vsli.64 q13,q2,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q3,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q7,q0,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q3,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q3,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 22<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 23 +#if 23<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 23>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 23<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 q12,q3,#19 + vshr.u64 q13,q3,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q3,#6 + vsli.64 q12,q3,#45 + vext.8 q14,q4,q5,#8 @ X[i+1] + vsli.64 q13,q3,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q4,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q0,q1,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q4,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q4,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 24<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 25 +#if 25<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 25>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 25<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q4,#19 + vshr.u64 q13,q4,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q4,#6 + vsli.64 q12,q4,#45 + vext.8 q14,q5,q6,#8 @ X[i+1] + vsli.64 q13,q4,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q5,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q1,q2,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q5,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q5,q15 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 26<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 27 +#if 27<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 27>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 27<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q5,#19 + vshr.u64 q13,q5,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q5,#6 + vsli.64 q12,q5,#45 + vext.8 q14,q6,q7,#8 @ X[i+1] + vsli.64 q13,q5,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q6,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q2,q3,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q6,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q6,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 28<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 29 +#if 29<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 29>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 29<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q6,#19 + vshr.u64 q13,q6,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q6,#6 + vsli.64 q12,q6,#45 + vext.8 q14,q7,q0,#8 @ X[i+1] + vsli.64 q13,q6,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q7,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q3,q4,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q7,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q7,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 30<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 31 +#if 31<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 31>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 31<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + bne .L16_79_neon + + vadd.i64 d16,d30 @ h+=Maj from the past + vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context + teq r1,r2 + sub r3,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + bx lr @ .word 0xe12fff1e +.size zfs_sha512_block_neon,.-zfs_sha512_block_neon +#endif diff --git a/module/icp/asm-ppc64/sha2/sha256-p8.S b/module/icp/asm-ppc64/sha2/sha256-p8.S new file mode 100644 index 000000000000..6bbfe23b6e15 --- /dev/null +++ b/module/icp/asm-ppc64/sha2/sha256-p8.S @@ -0,0 +1,1505 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) + +.text + +.globl zfs_sha256_power8 +.globl .zfs_sha256_power8 +.type zfs_sha256_power8,@function +.section ".opd","aw" +.align 3 +zfs_sha256_power8: +.quad .zfs_sha256_power8,.TOC.@tocbase,0 +.previous +.align 6 +.zfs_sha256_power8: + stdu 1,-384(1) + mflr 8 + li 10,207 + li 11,223 + stvx 24,10,1 + addi 10,10,32 + mfspr 12,256 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 11,-4096+255 + stw 12,332(1) + li 10,0x10 + std 26,336(1) + li 26,0x20 + std 27,344(1) + li 27,0x30 + std 28,352(1) + li 28,0x40 + std 29,360(1) + li 29,0x50 + std 30,368(1) + li 30,0x60 + std 31,376(1) + li 31,0x70 + std 8,400(1) + mtspr 256,11 + + bl .LPICmeup + addi 11,1,79 + .long 0x7C001E19 + .long 0x7C8A1E19 + vsldoi 1,0,0,4 + vsldoi 2,0,0,8 + vsldoi 3,0,0,12 + vsldoi 5,4,4,4 + vsldoi 6,4,4,8 + vsldoi 7,4,4,12 + li 0,3 + b .Loop +.align 5 +.Loop: + lvx 28,0,6 + .long 0x7D002699 + addi 4,4,16 + mr 7,6 + stvx 0,0,11 + stvx 1,10,11 + stvx 2,26,11 + stvx 3,27,11 + stvx 4,28,11 + stvx 5,29,11 + stvx 6,30,11 + stvx 7,31,11 + vadduwm 7,7,28 + lvx 28,10,6 + vadduwm 7,7,8 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + vsldoi 9,8,8,4 + vadduwm 6,6,9 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + vsldoi 10,9,9,4 + vadduwm 5,5,10 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x7D802699 + addi 4,4,16 + vsldoi 11,10,10,4 + vadduwm 4,4,11 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + vadduwm 3,3,12 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + vsldoi 13,12,12,4 + vadduwm 2,2,13 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + vsldoi 14,13,13,4 + vadduwm 1,1,14 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x7E002699 + addi 4,4,16 + vsldoi 15,14,14,4 + vadduwm 0,0,15 + vsel 29,7,6,5 + vadduwm 7,7,28 + 
vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + vadduwm 7,7,16 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + vsldoi 17,16,16,4 + vadduwm 6,6,17 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + vsldoi 18,17,17,4 + vadduwm 5,5,18 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x7F002699 + addi 4,4,16 + vsldoi 19,18,18,4 + vadduwm 4,4,19 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + vadduwm 3,3,24 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + vsldoi 25,24,24,4 + vadduwm 2,2,25 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + vsldoi 26,25,25,4 + vadduwm 1,1,26 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + vsldoi 27,26,26,4 + .long 0x13C90682 + vadduwm 8,8,30 + .long 0x13DA7E82 + vadduwm 8,8,30 + vadduwm 8,8,17 + vadduwm 0,0,27 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + mtctr 0 + b .L16_xx +.align 5 +.L16_xx: + .long 0x13CA0682 + vadduwm 9,9,30 + .long 0x13DB7E82 + vadduwm 9,9,30 + vadduwm 9,9,18 + vadduwm 7,7,8 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + .long 0x13CB0682 + vadduwm 10,10,30 + .long 0x13C87E82 + vadduwm 10,10,30 + vadduwm 10,10,19 + vadduwm 6,6,9 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + .long 0x13CC0682 + vadduwm 11,11,30 + .long 0x13C97E82 + vadduwm 11,11,30 + vadduwm 11,11,24 + vadduwm 5,5,10 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x13CD0682 + vadduwm 12,12,30 + .long 0x13CA7E82 + vadduwm 12,12,30 + vadduwm 12,12,25 + vadduwm 4,4,11 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + .long 0x13CE0682 + vadduwm 13,13,30 + .long 0x13CB7E82 + vadduwm 13,13,30 + vadduwm 
13,13,26 + vadduwm 3,3,12 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + .long 0x13CF0682 + vadduwm 14,14,30 + .long 0x13CC7E82 + vadduwm 14,14,30 + vadduwm 14,14,27 + vadduwm 2,2,13 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13D00682 + vadduwm 15,15,30 + .long 0x13CD7E82 + vadduwm 15,15,30 + vadduwm 15,15,8 + vadduwm 1,1,14 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x13D10682 + vadduwm 16,16,30 + .long 0x13CE7E82 + vadduwm 16,16,30 + vadduwm 16,16,9 + vadduwm 0,0,15 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + .long 0x13D20682 + vadduwm 17,17,30 + .long 0x13CF7E82 + vadduwm 17,17,30 + vadduwm 17,17,10 + vadduwm 7,7,16 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + .long 0x13D30682 + vadduwm 18,18,30 + .long 0x13D07E82 + vadduwm 18,18,30 + vadduwm 18,18,11 + vadduwm 6,6,17 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + .long 0x13D80682 + vadduwm 19,19,30 + .long 0x13D17E82 + vadduwm 19,19,30 + vadduwm 19,19,12 + vadduwm 5,5,18 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x13D90682 + vadduwm 24,24,30 + .long 0x13D27E82 + vadduwm 24,24,30 + vadduwm 24,24,13 + vadduwm 4,4,19 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + .long 0x13DA0682 + vadduwm 25,25,30 + .long 0x13D37E82 + vadduwm 25,25,30 + vadduwm 25,25,14 + vadduwm 3,3,24 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + .long 0x13DB0682 + vadduwm 26,26,30 + .long 0x13D87E82 + vadduwm 26,26,30 + vadduwm 26,26,15 + vadduwm 2,2,25 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13C80682 + vadduwm 27,27,30 + .long 0x13D97E82 + vadduwm 27,27,30 + vadduwm 27,27,16 + vadduwm 1,1,26 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x13C90682 + vadduwm 8,8,30 + .long 0x13DA7E82 + vadduwm 8,8,30 + vadduwm 8,8,17 + vadduwm 0,0,27 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + 
.long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + bdnz .L16_xx + + lvx 10,0,11 + subic. 5,5,1 + lvx 11,10,11 + vadduwm 0,0,10 + lvx 12,26,11 + vadduwm 1,1,11 + lvx 13,27,11 + vadduwm 2,2,12 + lvx 14,28,11 + vadduwm 3,3,13 + lvx 15,29,11 + vadduwm 4,4,14 + lvx 16,30,11 + vadduwm 5,5,15 + lvx 17,31,11 + vadduwm 6,6,16 + vadduwm 7,7,17 + bne .Loop + lvx 8,26,7 + vperm 0,0,1,28 + lvx 9,27,7 + vperm 4,4,5,28 + vperm 0,0,2,8 + vperm 4,4,6,8 + vperm 0,0,3,9 + vperm 4,4,7,9 + .long 0x7C001F19 + .long 0x7C8A1F19 + addi 11,1,207 + mtlr 8 + mtspr 256,12 + lvx 24,0,11 + lvx 25,10,11 + lvx 26,26,11 + lvx 27,27,11 + lvx 28,28,11 + lvx 29,29,11 + lvx 30,30,11 + lvx 31,31,11 + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,6,3,0 +.long 0 +.size .zfs_sha256_power8,.-.zfs_sha256_power8 +.size zfs_sha256_power8,.-.zfs_sha256_power8 +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98 +.long 0x71374491,0x71374491,0x71374491,0x71374491 +.long 0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf +.long 0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5 +.long 0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b +.long 0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1 +.long 0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4 +.long 0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5 +.long 0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98 +.long 0x12835b01,0x12835b01,0x12835b01,0x12835b01 +.long 0x243185be,0x243185be,0x243185be,0x243185be +.long 0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3 +.long 0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74 +.long 0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe +.long 0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7 +.long 0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174 +.long 0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1 +.long 0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786 +.long 0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6 +.long 0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc +.long 0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f +.long 0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa +.long 0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc +.long 0x76f988da,0x76f988da,0x76f988da,0x76f988da +.long 0x983e5152,0x983e5152,0x983e5152,0x983e5152 +.long 0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d +.long 0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8 +.long 0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7 +.long 0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3 +.long 0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147 +.long 0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351 +.long 0x14292967,0x14292967,0x14292967,0x14292967 +.long 0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85 +.long 0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138 +.long 0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc +.long 0x53380d13,0x53380d13,0x53380d13,0x53380d13 +.long 0x650a7354,0x650a7354,0x650a7354,0x650a7354 +.long 0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb +.long 0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e +.long 0x92722c85,0x92722c85,0x92722c85,0x92722c85 +.long 0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1 +.long 0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b +.long 0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70 +.long 0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3 +.long 0xd192e819,0xd192e819,0xd192e819,0xd192e819 +.long 0xd6990624,0xd6990624,0xd6990624,0xd6990624 +.long 0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585 +.long 
0x106aa070,0x106aa070,0x106aa070,0x106aa070 +.long 0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116 +.long 0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08 +.long 0x2748774c,0x2748774c,0x2748774c,0x2748774c +.long 0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5 +.long 0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3 +.long 0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a +.long 0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f +.long 0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3 +.long 0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee +.long 0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f +.long 0x84c87814,0x84c87814,0x84c87814,0x84c87814 +.long 0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208 +.long 0x90befffa,0x90befffa,0x90befffa,0x90befffa +.long 0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb +.long 0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7 +.long 0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2 +.long 0,0,0,0 +.long 0x00010203,0x10111213,0x10111213,0x10111213 +.long 0x00010203,0x04050607,0x10111213,0x10111213 +.long 0x00010203,0x04050607,0x08090a0b,0x10111213 + +#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +.abiversion 2 +.text + +.globl zfs_sha256_power8 +.type zfs_sha256_power8,@function +.align 6 +zfs_sha256_power8: +.localentry zfs_sha256_power8,0 + + stdu 1,-384(1) + mflr 8 + li 10,207 + li 11,223 + stvx 24,10,1 + addi 10,10,32 + li 12,-1 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 11,-4096+255 + stw 12,332(1) + li 10,0x10 + std 26,336(1) + li 26,0x20 + std 27,344(1) + li 27,0x30 + std 28,352(1) + li 28,0x40 + std 29,360(1) + li 29,0x50 + std 30,368(1) + li 30,0x60 + std 31,376(1) + li 31,0x70 + std 8,400(1) + or 11,11,11 + + bl .LPICmeup + addi 11,1,79 + li 7,8 + lvsl 31,0,7 + vspltisb 28,0x0f + vxor 31,31,28 + .long 0x7C001E19 + .long 0x7C8A1E19 + vsldoi 1,0,0,4 + vsldoi 2,0,0,8 + vsldoi 3,0,0,12 + vsldoi 5,4,4,4 + vsldoi 6,4,4,8 + vsldoi 7,4,4,12 + li 0,3 + b .Loop +.align 5 +.Loop: + lvx 28,0,6 + .long 0x7D002699 + addi 4,4,16 + mr 7,6 + stvx 0,0,11 + stvx 1,10,11 + stvx 2,26,11 + stvx 3,27,11 + stvx 4,28,11 + stvx 5,29,11 + stvx 6,30,11 + stvx 7,31,11 + vadduwm 7,7,28 + lvx 28,10,6 + vperm 8,8,8,31 + vadduwm 7,7,8 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + vsldoi 9,8,8,4 + vadduwm 6,6,9 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + vsldoi 10,9,9,4 + vadduwm 5,5,10 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x7D802699 + addi 4,4,16 + vsldoi 11,10,10,4 + vadduwm 4,4,11 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + vperm 12,12,12,31 + vadduwm 3,3,12 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + vsldoi 13,12,12,4 + vadduwm 2,2,13 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 
0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + vsldoi 14,13,13,4 + vadduwm 1,1,14 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x7E002699 + addi 4,4,16 + vsldoi 15,14,14,4 + vadduwm 0,0,15 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + vperm 16,16,16,31 + vadduwm 7,7,16 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + vsldoi 17,16,16,4 + vadduwm 6,6,17 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + vsldoi 18,17,17,4 + vadduwm 5,5,18 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x7F002699 + addi 4,4,16 + vsldoi 19,18,18,4 + vadduwm 4,4,19 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + vperm 24,24,24,31 + vadduwm 3,3,24 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + vsldoi 25,24,24,4 + vadduwm 2,2,25 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + vsldoi 26,25,25,4 + vadduwm 1,1,26 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + vsldoi 27,26,26,4 + .long 0x13C90682 + vadduwm 8,8,30 + .long 0x13DA7E82 + vadduwm 8,8,30 + vadduwm 8,8,17 + vadduwm 0,0,27 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + mtctr 0 + b .L16_xx +.align 5 +.L16_xx: + .long 0x13CA0682 + vadduwm 9,9,30 + .long 0x13DB7E82 + vadduwm 9,9,30 + vadduwm 9,9,18 + vadduwm 7,7,8 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + .long 0x13CB0682 + vadduwm 10,10,30 + .long 0x13C87E82 + vadduwm 10,10,30 + vadduwm 10,10,19 + vadduwm 6,6,9 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + .long 0x13CC0682 + vadduwm 11,11,30 + .long 0x13C97E82 + vadduwm 11,11,30 + vadduwm 11,11,24 + vadduwm 5,5,10 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 
0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x13CD0682 + vadduwm 12,12,30 + .long 0x13CA7E82 + vadduwm 12,12,30 + vadduwm 12,12,25 + vadduwm 4,4,11 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + .long 0x13CE0682 + vadduwm 13,13,30 + .long 0x13CB7E82 + vadduwm 13,13,30 + vadduwm 13,13,26 + vadduwm 3,3,12 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + .long 0x13CF0682 + vadduwm 14,14,30 + .long 0x13CC7E82 + vadduwm 14,14,30 + vadduwm 14,14,27 + vadduwm 2,2,13 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13D00682 + vadduwm 15,15,30 + .long 0x13CD7E82 + vadduwm 15,15,30 + vadduwm 15,15,8 + vadduwm 1,1,14 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x13D10682 + vadduwm 16,16,30 + .long 0x13CE7E82 + vadduwm 16,16,30 + vadduwm 16,16,9 + vadduwm 0,0,15 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + .long 0x13D20682 + vadduwm 17,17,30 + .long 0x13CF7E82 + vadduwm 17,17,30 + vadduwm 17,17,10 + vadduwm 7,7,16 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + .long 0x13D30682 + vadduwm 18,18,30 + .long 0x13D07E82 + vadduwm 18,18,30 + vadduwm 18,18,11 + vadduwm 6,6,17 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + .long 0x13D80682 + vadduwm 19,19,30 + .long 0x13D17E82 + vadduwm 19,19,30 + vadduwm 19,19,12 + vadduwm 5,5,18 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x13D90682 + vadduwm 24,24,30 + .long 0x13D27E82 + vadduwm 24,24,30 + vadduwm 24,24,13 + vadduwm 4,4,19 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + .long 0x13DA0682 + vadduwm 25,25,30 + .long 0x13D37E82 + vadduwm 25,25,30 + vadduwm 25,25,14 + vadduwm 3,3,24 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + .long 0x13DB0682 + vadduwm 26,26,30 + .long 0x13D87E82 + vadduwm 26,26,30 + vadduwm 26,26,15 + vadduwm 2,2,25 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 
0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13C80682 + vadduwm 27,27,30 + .long 0x13D97E82 + vadduwm 27,27,30 + vadduwm 27,27,16 + vadduwm 1,1,26 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x13C90682 + vadduwm 8,8,30 + .long 0x13DA7E82 + vadduwm 8,8,30 + vadduwm 8,8,17 + vadduwm 0,0,27 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + bdnz .L16_xx + + lvx 10,0,11 + subic. 5,5,1 + lvx 11,10,11 + vadduwm 0,0,10 + lvx 12,26,11 + vadduwm 1,1,11 + lvx 13,27,11 + vadduwm 2,2,12 + lvx 14,28,11 + vadduwm 3,3,13 + lvx 15,29,11 + vadduwm 4,4,14 + lvx 16,30,11 + vadduwm 5,5,15 + lvx 17,31,11 + vadduwm 6,6,16 + vadduwm 7,7,17 + bne .Loop + lvx 8,26,7 + vperm 0,0,1,28 + lvx 9,27,7 + vperm 4,4,5,28 + vperm 0,0,2,8 + vperm 4,4,6,8 + vperm 0,0,3,9 + vperm 4,4,7,9 + .long 0x7C001F19 + .long 0x7C8A1F19 + addi 11,1,207 + mtlr 8 + or 12,12,12 + lvx 24,0,11 + lvx 25,10,11 + lvx 26,26,11 + lvx 27,27,11 + lvx 28,28,11 + lvx 29,29,11 + lvx 30,30,11 + lvx 31,31,11 + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,6,3,0 +.long 0 +.size zfs_sha256_power8,.-zfs_sha256_power8 +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98 +.long 0x71374491,0x71374491,0x71374491,0x71374491 +.long 0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf +.long 0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5 +.long 0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b +.long 0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1 +.long 0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4 +.long 0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5 +.long 0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98 +.long 0x12835b01,0x12835b01,0x12835b01,0x12835b01 +.long 0x243185be,0x243185be,0x243185be,0x243185be +.long 0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3 +.long 0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74 +.long 0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe +.long 0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7 +.long 0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174 +.long 0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1 +.long 0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786 +.long 0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6 +.long 0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc +.long 0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f +.long 0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa +.long 0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc +.long 0x76f988da,0x76f988da,0x76f988da,0x76f988da +.long 0x983e5152,0x983e5152,0x983e5152,0x983e5152 +.long 0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d +.long 0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8 +.long 0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7 +.long 0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3 +.long 0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147 +.long 0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351 +.long 0x14292967,0x14292967,0x14292967,0x14292967 +.long 0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85 +.long 0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138 +.long 0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc +.long 0x53380d13,0x53380d13,0x53380d13,0x53380d13 +.long 0x650a7354,0x650a7354,0x650a7354,0x650a7354 +.long 
0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb +.long 0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e +.long 0x92722c85,0x92722c85,0x92722c85,0x92722c85 +.long 0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1 +.long 0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b +.long 0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70 +.long 0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3 +.long 0xd192e819,0xd192e819,0xd192e819,0xd192e819 +.long 0xd6990624,0xd6990624,0xd6990624,0xd6990624 +.long 0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585 +.long 0x106aa070,0x106aa070,0x106aa070,0x106aa070 +.long 0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116 +.long 0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08 +.long 0x2748774c,0x2748774c,0x2748774c,0x2748774c +.long 0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5 +.long 0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3 +.long 0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a +.long 0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f +.long 0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3 +.long 0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee +.long 0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f +.long 0x84c87814,0x84c87814,0x84c87814,0x84c87814 +.long 0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208 +.long 0x90befffa,0x90befffa,0x90befffa,0x90befffa +.long 0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb +.long 0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7 +.long 0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2 +.long 0,0,0,0 +.long 0x10111213,0x10111213,0x10111213,0x00010203 +.long 0x10111213,0x10111213,0x04050607,0x00010203 +.long 0x10111213,0x08090a0b,0x04050607,0x00010203 +#endif diff --git a/module/icp/asm-ppc64/sha2/sha256-ppc.S b/module/icp/asm-ppc64/sha2/sha256-ppc.S new file mode 100644 index 000000000000..2219e313c9c6 --- /dev/null +++ b/module/icp/asm-ppc64/sha2/sha256-ppc.S @@ -0,0 +1,2712 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) + +.text + +.globl zfs_sha256_ppc +.globl .zfs_sha256_ppc +.type zfs_sha256_ppc,@function +.section ".opd","aw" +.align 3 +zfs_sha256_ppc: +.quad .zfs_sha256_ppc,.TOC.@tocbase,0 +.previous +.align 6 +.zfs_sha256_ppc: + stdu 1,-320(1) + mflr 0 + sldi 5,5,6 + + std 3,144(1) + + std 14,176(1) + std 15,184(1) + std 16,192(1) + std 17,200(1) + std 18,208(1) + std 19,216(1) + std 20,224(1) + std 21,232(1) + std 22,240(1) + std 23,248(1) + std 24,256(1) + std 25,264(1) + std 26,272(1) + std 27,280(1) + std 28,288(1) + std 29,296(1) + std 30,304(1) + std 31,312(1) + std 0,336(1) + lwz 8,0(3) + mr 31,4 + lwz 9,4(3) + lwz 10,8(3) + lwz 11,12(3) + lwz 12,16(3) + lwz 6,20(3) + lwz 14,24(3) + lwz 15,28(3) + bl .LPICmeup +.LPICedup: + andi. 0,31,3 + bne .Lunaligned +.Laligned: + add 5,31,5 + std 5,128(1) + std 31,136(1) + bl .Lsha2_block_private + b .Ldone + +.align 4 +.Lunaligned: + subfic 0,31,4096 + andi. 
0,0,4032 + beq .Lcross_page + cmpld 5,0 + ble .Laligned + subfc 5,0,5 + add 0,31,0 + std 5,120(1) + std 0,128(1) + std 31,136(1) + bl .Lsha2_block_private + + ld 5,120(1) +.Lcross_page: + li 0,16 + mtctr 0 + addi 20,1,48 +.Lmemcpy: + lbz 16,0(31) + lbz 17,1(31) + lbz 18,2(31) + lbz 19,3(31) + addi 31,31,4 + stb 16,0(20) + stb 17,1(20) + stb 18,2(20) + stb 19,3(20) + addi 20,20,4 + bdnz .Lmemcpy + std 31,112(1) + addi 0,1,112 + addi 31,1,48 + std 5,120(1) + std 0,128(1) + std 31,136(1) + bl .Lsha2_block_private + ld 31,112(1) + ld 5,120(1) + addic. 5,5,-64 + bne .Lunaligned + +.Ldone: + ld 0,336(1) + ld 14,176(1) + ld 15,184(1) + ld 16,192(1) + ld 17,200(1) + ld 18,208(1) + ld 19,216(1) + ld 20,224(1) + ld 21,232(1) + ld 22,240(1) + ld 23,248(1) + ld 24,256(1) + ld 25,264(1) + ld 26,272(1) + ld 27,280(1) + ld 28,288(1) + ld 29,296(1) + ld 30,304(1) + ld 31,312(1) + mtlr 0 + addi 1,1,320 + blr +.long 0 +.byte 0,12,4,1,0x80,18,3,0 +.long 0 +.align 4 +.Lsha2_block_private: + lwz 0,0(7) + lwz 16,0(31) + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + lwz 0,4(7) + add 15,15,3 + add 15,15,5 + + lwz 17,4(31) + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + lwz 0,8(7) + add 14,14,3 + add 14,14,5 + + lwz 18,8(31) + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + lwz 0,12(7) + add 6,6,3 + add 6,6,5 + + lwz 19,12(31) + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + lwz 0,16(7) + add 12,12,3 + add 12,12,5 + + lwz 20,16(31) + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + lwz 0,20(7) + add 11,11,3 + add 11,11,5 + + lwz 21,20(31) + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + lwz 0,24(7) + add 10,10,3 + add 10,10,5 + + lwz 22,24(31) + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 
3,3,4 + add 6,6,9 + xor 5,5,0 + lwz 0,28(7) + add 9,9,3 + add 9,9,5 + + lwz 23,28(31) + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + lwz 0,32(7) + add 8,8,3 + add 8,8,5 + + lwz 24,32(31) + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + lwz 0,36(7) + add 15,15,3 + add 15,15,5 + + lwz 25,36(31) + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + lwz 0,40(7) + add 14,14,3 + add 14,14,5 + + lwz 26,40(31) + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + lwz 0,44(7) + add 6,6,3 + add 6,6,5 + + lwz 27,44(31) + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + lwz 0,48(7) + add 12,12,3 + add 12,12,5 + + lwz 28,48(31) + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + lwz 0,52(7) + add 11,11,3 + add 11,11,5 + + lwz 29,52(31) + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + lwz 0,56(7) + add 10,10,3 + add 10,10,5 + + lwz 30,56(31) + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + lwz 0,60(7) + add 9,9,3 + add 9,9,5 + + lwz 31,60(31) + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + li 5,3 + mtctr 5 +.align 4 +.Lrounds: + addi 7,7,64 + rotrwi 3,17,7 + rotrwi 4,17,18 + rotrwi 5,30,17 + rotrwi 0,30,19 + xor 3,3,4 + srwi 
4,17,3 + xor 5,5,0 + srwi 0,30,10 + add 16,16,25 + xor 3,3,4 + xor 5,5,0 + lwz 0,0(7) + add 16,16,3 + add 16,16,5 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrwi 3,18,7 + rotrwi 4,18,18 + rotrwi 5,31,17 + rotrwi 0,31,19 + xor 3,3,4 + srwi 4,18,3 + xor 5,5,0 + srwi 0,31,10 + add 17,17,26 + xor 3,3,4 + xor 5,5,0 + lwz 0,4(7) + add 17,17,3 + add 17,17,5 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrwi 3,19,7 + rotrwi 4,19,18 + rotrwi 5,16,17 + rotrwi 0,16,19 + xor 3,3,4 + srwi 4,19,3 + xor 5,5,0 + srwi 0,16,10 + add 18,18,27 + xor 3,3,4 + xor 5,5,0 + lwz 0,8(7) + add 18,18,3 + add 18,18,5 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrwi 3,20,7 + rotrwi 4,20,18 + rotrwi 5,17,17 + rotrwi 0,17,19 + xor 3,3,4 + srwi 4,20,3 + xor 5,5,0 + srwi 0,17,10 + add 19,19,28 + xor 3,3,4 + xor 5,5,0 + lwz 0,12(7) + add 19,19,3 + add 19,19,5 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrwi 3,21,7 + rotrwi 4,21,18 + rotrwi 5,18,17 + rotrwi 0,18,19 + xor 3,3,4 + srwi 4,21,3 + xor 5,5,0 + srwi 0,18,10 + add 20,20,29 + xor 3,3,4 + xor 5,5,0 + lwz 0,16(7) + add 20,20,3 + add 20,20,5 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrwi 3,22,7 + rotrwi 4,22,18 + rotrwi 5,19,17 + rotrwi 0,19,19 + xor 3,3,4 + srwi 4,22,3 + xor 5,5,0 + srwi 0,19,10 + add 21,21,30 + xor 3,3,4 + xor 5,5,0 + lwz 0,20(7) + add 21,21,3 + add 21,21,5 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrwi 3,23,7 + rotrwi 4,23,18 + rotrwi 5,20,17 + rotrwi 0,20,19 + xor 3,3,4 + srwi 4,23,3 + xor 5,5,0 + srwi 0,20,10 + add 22,22,31 + xor 3,3,4 + xor 5,5,0 + lwz 0,24(7) + add 22,22,3 + add 22,22,5 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 
9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrwi 3,24,7 + rotrwi 4,24,18 + rotrwi 5,21,17 + rotrwi 0,21,19 + xor 3,3,4 + srwi 4,24,3 + xor 5,5,0 + srwi 0,21,10 + add 23,23,16 + xor 3,3,4 + xor 5,5,0 + lwz 0,28(7) + add 23,23,3 + add 23,23,5 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + rotrwi 3,25,7 + rotrwi 4,25,18 + rotrwi 5,22,17 + rotrwi 0,22,19 + xor 3,3,4 + srwi 4,25,3 + xor 5,5,0 + srwi 0,22,10 + add 24,24,17 + xor 3,3,4 + xor 5,5,0 + lwz 0,32(7) + add 24,24,3 + add 24,24,5 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrwi 3,26,7 + rotrwi 4,26,18 + rotrwi 5,23,17 + rotrwi 0,23,19 + xor 3,3,4 + srwi 4,26,3 + xor 5,5,0 + srwi 0,23,10 + add 25,25,18 + xor 3,3,4 + xor 5,5,0 + lwz 0,36(7) + add 25,25,3 + add 25,25,5 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrwi 3,27,7 + rotrwi 4,27,18 + rotrwi 5,24,17 + rotrwi 0,24,19 + xor 3,3,4 + srwi 4,27,3 + xor 5,5,0 + srwi 0,24,10 + add 26,26,19 + xor 3,3,4 + xor 5,5,0 + lwz 0,40(7) + add 26,26,3 + add 26,26,5 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrwi 3,28,7 + rotrwi 4,28,18 + rotrwi 5,25,17 + rotrwi 0,25,19 + xor 3,3,4 + srwi 4,28,3 + xor 5,5,0 + srwi 0,25,10 + add 27,27,20 + xor 3,3,4 + xor 5,5,0 + lwz 0,44(7) + add 27,27,3 + add 27,27,5 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrwi 3,29,7 + rotrwi 4,29,18 + rotrwi 5,26,17 + rotrwi 0,26,19 + xor 3,3,4 + srwi 4,29,3 + xor 5,5,0 + srwi 0,26,10 + add 28,28,21 + xor 3,3,4 + xor 5,5,0 + lwz 0,48(7) + add 28,28,3 + add 28,28,5 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrwi 3,30,7 + rotrwi 4,30,18 + rotrwi 5,27,17 + rotrwi 0,27,19 + xor 3,3,4 + srwi 4,30,3 + xor 
5,5,0 + srwi 0,27,10 + add 29,29,22 + xor 3,3,4 + xor 5,5,0 + lwz 0,52(7) + add 29,29,3 + add 29,29,5 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrwi 3,31,7 + rotrwi 4,31,18 + rotrwi 5,28,17 + rotrwi 0,28,19 + xor 3,3,4 + srwi 4,31,3 + xor 5,5,0 + srwi 0,28,10 + add 30,30,23 + xor 3,3,4 + xor 5,5,0 + lwz 0,56(7) + add 30,30,3 + add 30,30,5 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrwi 3,16,7 + rotrwi 4,16,18 + rotrwi 5,29,17 + rotrwi 0,29,19 + xor 3,3,4 + srwi 4,16,3 + xor 5,5,0 + srwi 0,29,10 + add 31,31,24 + xor 3,3,4 + xor 5,5,0 + lwz 0,60(7) + add 31,31,3 + add 31,31,5 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + bdnz .Lrounds + + ld 3,144(1) + ld 31,136(1) + ld 5,128(1) + subi 7,7,192 + + lwz 16,0(3) + lwz 17,4(3) + lwz 18,8(3) + lwz 19,12(3) + lwz 20,16(3) + lwz 21,20(3) + lwz 22,24(3) + addi 31,31,64 + lwz 23,28(3) + add 8,8,16 + add 9,9,17 + std 31,136(1) + add 10,10,18 + stw 8,0(3) + add 11,11,19 + stw 9,4(3) + add 12,12,20 + stw 10,8(3) + add 6,6,21 + stw 11,12(3) + add 14,14,22 + stw 12,16(3) + add 15,15,23 + stw 6,20(3) + stw 14,24(3) + cmpld 31,5 + stw 15,28(3) + bne .Lsha2_block_private + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size .zfs_sha256_ppc,.-.zfs_sha256_ppc +.size zfs_sha256_ppc,.-.zfs_sha256_ppc +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 7 + addi 7,7,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +.abiversion 2 +.text + +.globl zfs_sha256_ppc +.type zfs_sha256_ppc,@function +.align 6 +zfs_sha256_ppc: +.localentry zfs_sha256_ppc,0 + + stdu 1,-320(1) + mflr 0 + sldi 5,5,6 + + std 3,144(1) + + std 14,176(1) + std 15,184(1) + std 16,192(1) + std 17,200(1) + std 18,208(1) + std 19,216(1) + std 20,224(1) + std 21,232(1) + std 22,240(1) + std 23,248(1) + std 24,256(1) + std 25,264(1) + std 
26,272(1) + std 27,280(1) + std 28,288(1) + std 29,296(1) + std 30,304(1) + std 31,312(1) + std 0,336(1) + lwz 8,0(3) + mr 31,4 + lwz 9,4(3) + lwz 10,8(3) + lwz 11,12(3) + lwz 12,16(3) + lwz 6,20(3) + lwz 14,24(3) + lwz 15,28(3) + bl .LPICmeup +.LPICedup: + andi. 0,31,3 + bne .Lunaligned +.Laligned: + add 5,31,5 + std 5,128(1) + std 31,136(1) + bl .Lsha2_block_private + b .Ldone + +.align 4 +.Lunaligned: + subfic 0,31,4096 + andi. 0,0,4032 + beq .Lcross_page + cmpld 5,0 + ble .Laligned + subfc 5,0,5 + add 0,31,0 + std 5,120(1) + std 0,128(1) + std 31,136(1) + bl .Lsha2_block_private + + ld 5,120(1) +.Lcross_page: + li 0,16 + mtctr 0 + addi 20,1,48 +.Lmemcpy: + lbz 16,0(31) + lbz 17,1(31) + lbz 18,2(31) + lbz 19,3(31) + addi 31,31,4 + stb 16,0(20) + stb 17,1(20) + stb 18,2(20) + stb 19,3(20) + addi 20,20,4 + bdnz .Lmemcpy + std 31,112(1) + addi 0,1,112 + addi 31,1,48 + std 5,120(1) + std 0,128(1) + std 31,136(1) + bl .Lsha2_block_private + ld 31,112(1) + ld 5,120(1) + addic. 5,5,-64 + bne .Lunaligned + +.Ldone: + ld 0,336(1) + ld 14,176(1) + ld 15,184(1) + ld 16,192(1) + ld 17,200(1) + ld 18,208(1) + ld 19,216(1) + ld 20,224(1) + ld 21,232(1) + ld 22,240(1) + ld 23,248(1) + ld 24,256(1) + ld 25,264(1) + ld 26,272(1) + ld 27,280(1) + ld 28,288(1) + ld 29,296(1) + ld 30,304(1) + ld 31,312(1) + mtlr 0 + addi 1,1,320 + blr +.long 0 +.byte 0,12,4,1,0x80,18,3,0 +.long 0 +.align 4 +.Lsha2_block_private: + lwz 0,0(7) + lwz 3,0(31) + rotlwi 16,3,8 + rlwimi 16,3,24,0,7 + rlwimi 16,3,24,16,23 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + lwz 0,4(7) + add 15,15,3 + add 15,15,5 + + lwz 3,4(31) + rotlwi 17,3,8 + rlwimi 17,3,24,0,7 + rlwimi 17,3,24,16,23 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + lwz 0,8(7) + add 14,14,3 + add 14,14,5 + + lwz 3,8(31) + rotlwi 18,3,8 + rlwimi 18,3,24,0,7 + rlwimi 18,3,24,16,23 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + lwz 0,12(7) + add 6,6,3 + add 6,6,5 + + lwz 3,12(31) + rotlwi 19,3,8 + rlwimi 19,3,24,0,7 + rlwimi 19,3,24,16,23 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + lwz 0,16(7) + add 12,12,3 + add 12,12,5 + + lwz 3,16(31) + rotlwi 20,3,8 + rlwimi 20,3,24,0,7 + rlwimi 20,3,24,16,23 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 
5,5,0 + lwz 0,20(7) + add 11,11,3 + add 11,11,5 + + lwz 3,20(31) + rotlwi 21,3,8 + rlwimi 21,3,24,0,7 + rlwimi 21,3,24,16,23 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + lwz 0,24(7) + add 10,10,3 + add 10,10,5 + + lwz 3,24(31) + rotlwi 22,3,8 + rlwimi 22,3,24,0,7 + rlwimi 22,3,24,16,23 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + lwz 0,28(7) + add 9,9,3 + add 9,9,5 + + lwz 3,28(31) + rotlwi 23,3,8 + rlwimi 23,3,24,0,7 + rlwimi 23,3,24,16,23 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + lwz 0,32(7) + add 8,8,3 + add 8,8,5 + + lwz 3,32(31) + rotlwi 24,3,8 + rlwimi 24,3,24,0,7 + rlwimi 24,3,24,16,23 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + lwz 0,36(7) + add 15,15,3 + add 15,15,5 + + lwz 3,36(31) + rotlwi 25,3,8 + rlwimi 25,3,24,0,7 + rlwimi 25,3,24,16,23 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + lwz 0,40(7) + add 14,14,3 + add 14,14,5 + + lwz 3,40(31) + rotlwi 26,3,8 + rlwimi 26,3,24,0,7 + rlwimi 26,3,24,16,23 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + lwz 0,44(7) + add 6,6,3 + add 6,6,5 + + lwz 3,44(31) + rotlwi 27,3,8 + rlwimi 27,3,24,0,7 + rlwimi 27,3,24,16,23 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + lwz 0,48(7) + add 12,12,3 + add 12,12,5 + + lwz 3,48(31) + rotlwi 28,3,8 + rlwimi 28,3,24,0,7 + rlwimi 28,3,24,16,23 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + lwz 0,52(7) + add 11,11,3 + add 11,11,5 + + lwz 3,52(31) + rotlwi 29,3,8 + rlwimi 
29,3,24,0,7 + rlwimi 29,3,24,16,23 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + lwz 0,56(7) + add 10,10,3 + add 10,10,5 + + lwz 3,56(31) + rotlwi 30,3,8 + rlwimi 30,3,24,0,7 + rlwimi 30,3,24,16,23 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + lwz 0,60(7) + add 9,9,3 + add 9,9,5 + + lwz 3,60(31) + rotlwi 31,3,8 + rlwimi 31,3,24,0,7 + rlwimi 31,3,24,16,23 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + li 5,3 + mtctr 5 +.align 4 +.Lrounds: + addi 7,7,64 + rotrwi 3,17,7 + rotrwi 4,17,18 + rotrwi 5,30,17 + rotrwi 0,30,19 + xor 3,3,4 + srwi 4,17,3 + xor 5,5,0 + srwi 0,30,10 + add 16,16,25 + xor 3,3,4 + xor 5,5,0 + lwz 0,0(7) + add 16,16,3 + add 16,16,5 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrwi 3,18,7 + rotrwi 4,18,18 + rotrwi 5,31,17 + rotrwi 0,31,19 + xor 3,3,4 + srwi 4,18,3 + xor 5,5,0 + srwi 0,31,10 + add 17,17,26 + xor 3,3,4 + xor 5,5,0 + lwz 0,4(7) + add 17,17,3 + add 17,17,5 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrwi 3,19,7 + rotrwi 4,19,18 + rotrwi 5,16,17 + rotrwi 0,16,19 + xor 3,3,4 + srwi 4,19,3 + xor 5,5,0 + srwi 0,16,10 + add 18,18,27 + xor 3,3,4 + xor 5,5,0 + lwz 0,8(7) + add 18,18,3 + add 18,18,5 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrwi 3,20,7 + rotrwi 4,20,18 + rotrwi 5,17,17 + rotrwi 0,17,19 + xor 3,3,4 + srwi 4,20,3 + xor 5,5,0 + srwi 0,17,10 + add 19,19,28 + xor 3,3,4 + xor 5,5,0 + lwz 0,12(7) + add 19,19,3 + add 19,19,5 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrwi 3,21,7 + rotrwi 4,21,18 + rotrwi 5,18,17 + rotrwi 0,18,19 + xor 3,3,4 + 
srwi 4,21,3 + xor 5,5,0 + srwi 0,18,10 + add 20,20,29 + xor 3,3,4 + xor 5,5,0 + lwz 0,16(7) + add 20,20,3 + add 20,20,5 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrwi 3,22,7 + rotrwi 4,22,18 + rotrwi 5,19,17 + rotrwi 0,19,19 + xor 3,3,4 + srwi 4,22,3 + xor 5,5,0 + srwi 0,19,10 + add 21,21,30 + xor 3,3,4 + xor 5,5,0 + lwz 0,20(7) + add 21,21,3 + add 21,21,5 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrwi 3,23,7 + rotrwi 4,23,18 + rotrwi 5,20,17 + rotrwi 0,20,19 + xor 3,3,4 + srwi 4,23,3 + xor 5,5,0 + srwi 0,20,10 + add 22,22,31 + xor 3,3,4 + xor 5,5,0 + lwz 0,24(7) + add 22,22,3 + add 22,22,5 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrwi 3,24,7 + rotrwi 4,24,18 + rotrwi 5,21,17 + rotrwi 0,21,19 + xor 3,3,4 + srwi 4,24,3 + xor 5,5,0 + srwi 0,21,10 + add 23,23,16 + xor 3,3,4 + xor 5,5,0 + lwz 0,28(7) + add 23,23,3 + add 23,23,5 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + rotrwi 3,25,7 + rotrwi 4,25,18 + rotrwi 5,22,17 + rotrwi 0,22,19 + xor 3,3,4 + srwi 4,25,3 + xor 5,5,0 + srwi 0,22,10 + add 24,24,17 + xor 3,3,4 + xor 5,5,0 + lwz 0,32(7) + add 24,24,3 + add 24,24,5 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrwi 3,26,7 + rotrwi 4,26,18 + rotrwi 5,23,17 + rotrwi 0,23,19 + xor 3,3,4 + srwi 4,26,3 + xor 5,5,0 + srwi 0,23,10 + add 25,25,18 + xor 3,3,4 + xor 5,5,0 + lwz 0,36(7) + add 25,25,3 + add 25,25,5 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrwi 3,27,7 + rotrwi 4,27,18 + rotrwi 5,24,17 + rotrwi 0,24,19 + xor 3,3,4 + srwi 4,27,3 + xor 5,5,0 + srwi 0,24,10 + add 26,26,19 + xor 3,3,4 + xor 5,5,0 + lwz 0,40(7) + add 26,26,3 + add 26,26,5 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 
6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrwi 3,28,7 + rotrwi 4,28,18 + rotrwi 5,25,17 + rotrwi 0,25,19 + xor 3,3,4 + srwi 4,28,3 + xor 5,5,0 + srwi 0,25,10 + add 27,27,20 + xor 3,3,4 + xor 5,5,0 + lwz 0,44(7) + add 27,27,3 + add 27,27,5 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrwi 3,29,7 + rotrwi 4,29,18 + rotrwi 5,26,17 + rotrwi 0,26,19 + xor 3,3,4 + srwi 4,29,3 + xor 5,5,0 + srwi 0,26,10 + add 28,28,21 + xor 3,3,4 + xor 5,5,0 + lwz 0,48(7) + add 28,28,3 + add 28,28,5 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrwi 3,30,7 + rotrwi 4,30,18 + rotrwi 5,27,17 + rotrwi 0,27,19 + xor 3,3,4 + srwi 4,30,3 + xor 5,5,0 + srwi 0,27,10 + add 29,29,22 + xor 3,3,4 + xor 5,5,0 + lwz 0,52(7) + add 29,29,3 + add 29,29,5 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrwi 3,31,7 + rotrwi 4,31,18 + rotrwi 5,28,17 + rotrwi 0,28,19 + xor 3,3,4 + srwi 4,31,3 + xor 5,5,0 + srwi 0,28,10 + add 30,30,23 + xor 3,3,4 + xor 5,5,0 + lwz 0,56(7) + add 30,30,3 + add 30,30,5 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrwi 3,16,7 + rotrwi 4,16,18 + rotrwi 5,29,17 + rotrwi 0,29,19 + xor 3,3,4 + srwi 4,16,3 + xor 5,5,0 + srwi 0,29,10 + add 31,31,24 + xor 3,3,4 + xor 5,5,0 + lwz 0,60(7) + add 31,31,3 + add 31,31,5 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + bdnz .Lrounds + + ld 3,144(1) + ld 31,136(1) + ld 5,128(1) + subi 7,7,192 + + lwz 16,0(3) + lwz 17,4(3) + lwz 18,8(3) + lwz 19,12(3) + lwz 20,16(3) + lwz 21,20(3) + lwz 22,24(3) + addi 31,31,64 + lwz 23,28(3) + add 8,8,16 + add 9,9,17 + std 31,136(1) + add 10,10,18 + stw 8,0(3) + add 11,11,19 + stw 9,4(3) + add 12,12,20 + stw 10,8(3) + add 6,6,21 + stw 11,12(3) + add 14,14,22 + stw 12,16(3) + add 15,15,23 + stw 6,20(3) + stw 14,24(3) + cmpld 31,5 + stw 15,28(3) + bne .Lsha2_block_private + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size zfs_sha256_ppc,.-zfs_sha256_ppc +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 7 + addi 7,7,56 + 
mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +#endif diff --git a/module/icp/asm-ppc64/sha2/sha512-p8.S b/module/icp/asm-ppc64/sha2/sha512-p8.S new file mode 100644 index 000000000000..39a90ede3dc5 --- /dev/null +++ b/module/icp/asm-ppc64/sha2/sha512-p8.S @@ -0,0 +1,1706 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) + +.text + +.globl zfs_sha512_power8 +.globl .zfs_sha512_power8 +.type zfs_sha512_power8,@function +.section ".opd","aw" +.align 3 +zfs_sha512_power8: +.quad .zfs_sha512_power8,.TOC.@tocbase,0 +.previous +.align 6 +.zfs_sha512_power8: + stdu 1,-384(1) + mflr 8 + li 10,207 + li 11,223 + stvx 24,10,1 + addi 10,10,32 + mfspr 12,256 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 11,-4096+255 + stw 12,332(1) + li 10,0x10 + std 26,336(1) + li 26,0x20 + std 27,344(1) + li 27,0x30 + std 28,352(1) + li 28,0x40 + std 29,360(1) + li 29,0x50 + std 30,368(1) + li 30,0x60 + std 31,376(1) + li 31,0x70 + std 8,400(1) + mtspr 256,11 + + bl .LPICmeup + addi 11,1,79 + .long 0x7C001E99 + .long 0x7C4A1E99 + .long 0x7C9A1E99 + vsldoi 1,0,0,8 + .long 0x7CDB1E99 + vsldoi 3,2,2,8 + vsldoi 5,4,4,8 + vsldoi 7,6,6,8 + li 0,4 + b .Loop +.align 5 +.Loop: + lvx 28,0,6 + .long 0x7D002699 + addi 4,4,16 + mr 7,6 + stvx 0,0,11 + stvx 1,10,11 + stvx 2,26,11 + stvx 3,27,11 + stvx 4,28,11 + stvx 5,29,11 + stvx 6,30,11 + stvx 7,31,11 + .long 0x10E7E0C0 + lvx 28,10,6 + .long 0x10E740C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x7D402699 + addi 4,4,16 + vsldoi 9,8,8,8 + .long 0x10C648C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 
0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x10A550C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x7D802699 + addi 4,4,16 + vsldoi 11,10,10,8 + .long 0x108458C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x106360C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x7DC02699 + addi 4,4,16 + vsldoi 13,12,12,8 + .long 0x104268C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x102170C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x7E002699 + addi 4,4,16 + vsldoi 15,14,14,8 + .long 0x100078C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + .long 0x10E780C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x7E402699 + addi 4,4,16 + vsldoi 17,16,16,8 + .long 0x10C688C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x10A590C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x7F002699 + addi 4,4,16 + vsldoi 19,18,18,8 + .long 0x108498C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x1063C0C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x7F402699 + addi 4,4,16 + vsldoi 25,24,24,8 + .long 0x1042C8C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x1021D0C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + vsldoi 27,26,26,8 + .long 0x13C906C2 + .long 0x1108F0C0 + .long 0x13DA7EC2 + .long 0x1108F0C0 + .long 0x110888C0 + .long 
0x1000D8C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + mtctr 0 + b .L16_xx +.align 5 +.L16_xx: + .long 0x13CA06C2 + .long 0x1129F0C0 + .long 0x13DB7EC2 + .long 0x1129F0C0 + .long 0x112990C0 + .long 0x10E740C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x13CB06C2 + .long 0x114AF0C0 + .long 0x13C87EC2 + .long 0x114AF0C0 + .long 0x114A98C0 + .long 0x10C648C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x13CC06C2 + .long 0x116BF0C0 + .long 0x13C97EC2 + .long 0x116BF0C0 + .long 0x116BC0C0 + .long 0x10A550C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x13CD06C2 + .long 0x118CF0C0 + .long 0x13CA7EC2 + .long 0x118CF0C0 + .long 0x118CC8C0 + .long 0x108458C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x13CE06C2 + .long 0x11ADF0C0 + .long 0x13CB7EC2 + .long 0x11ADF0C0 + .long 0x11ADD0C0 + .long 0x106360C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x13CF06C2 + .long 0x11CEF0C0 + .long 0x13CC7EC2 + .long 0x11CEF0C0 + .long 0x11CED8C0 + .long 0x104268C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13D006C2 + .long 0x11EFF0C0 + .long 0x13CD7EC2 + .long 0x11EFF0C0 + .long 0x11EF40C0 + .long 0x102170C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x13D106C2 + .long 0x1210F0C0 + .long 0x13CE7EC2 + .long 0x1210F0C0 + .long 0x121048C0 + .long 0x100078C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + .long 0x13D206C2 + .long 0x1231F0C0 + .long 0x13CF7EC2 + .long 0x1231F0C0 + .long 0x123150C0 + .long 0x10E780C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x13D306C2 + .long 0x1252F0C0 + .long 0x13D07EC2 + .long 0x1252F0C0 + .long 0x125258C0 + .long 0x10C688C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + 
.long 0x13D806C2 + .long 0x1273F0C0 + .long 0x13D17EC2 + .long 0x1273F0C0 + .long 0x127360C0 + .long 0x10A590C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x13D906C2 + .long 0x1318F0C0 + .long 0x13D27EC2 + .long 0x1318F0C0 + .long 0x131868C0 + .long 0x108498C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x13DA06C2 + .long 0x1339F0C0 + .long 0x13D37EC2 + .long 0x1339F0C0 + .long 0x133970C0 + .long 0x1063C0C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x13DB06C2 + .long 0x135AF0C0 + .long 0x13D87EC2 + .long 0x135AF0C0 + .long 0x135A78C0 + .long 0x1042C8C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13C806C2 + .long 0x137BF0C0 + .long 0x13D97EC2 + .long 0x137BF0C0 + .long 0x137B80C0 + .long 0x1021D0C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x13C906C2 + .long 0x1108F0C0 + .long 0x13DA7EC2 + .long 0x1108F0C0 + .long 0x110888C0 + .long 0x1000D8C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + bdnz .L16_xx + + lvx 10,0,11 + subic. 
5,5,1 + lvx 11,10,11 + .long 0x100050C0 + lvx 12,26,11 + .long 0x102158C0 + lvx 13,27,11 + .long 0x104260C0 + lvx 14,28,11 + .long 0x106368C0 + lvx 15,29,11 + .long 0x108470C0 + lvx 16,30,11 + .long 0x10A578C0 + lvx 17,31,11 + .long 0x10C680C0 + .long 0x10E788C0 + bne .Loop + vperm 0,0,1,28 + vperm 2,2,3,28 + vperm 4,4,5,28 + vperm 6,6,7,28 + .long 0x7C001F99 + .long 0x7C4A1F99 + .long 0x7C9A1F99 + .long 0x7CDB1F99 + addi 11,1,207 + mtlr 8 + mtspr 256,12 + lvx 24,0,11 + lvx 25,10,11 + lvx 26,26,11 + lvx 27,27,11 + lvx 28,28,11 + lvx 29,29,11 + lvx 30,30,11 + lvx 31,31,11 + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,6,3,0 +.long 0 +.size .zfs_sha512_power8,.-.zfs_sha512_power8 +.size zfs_sha512_power8,.-.zfs_sha512_power8 +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0xd728ae22 +.long 0x428a2f98,0xd728ae22 +.long 0x71374491,0x23ef65cd +.long 0x71374491,0x23ef65cd +.long 0xb5c0fbcf,0xec4d3b2f +.long 0xb5c0fbcf,0xec4d3b2f +.long 0xe9b5dba5,0x8189dbbc +.long 0xe9b5dba5,0x8189dbbc +.long 0x3956c25b,0xf348b538 +.long 0x3956c25b,0xf348b538 +.long 0x59f111f1,0xb605d019 +.long 0x59f111f1,0xb605d019 +.long 0x923f82a4,0xaf194f9b +.long 0x923f82a4,0xaf194f9b +.long 0xab1c5ed5,0xda6d8118 +.long 0xab1c5ed5,0xda6d8118 +.long 0xd807aa98,0xa3030242 +.long 0xd807aa98,0xa3030242 +.long 0x12835b01,0x45706fbe +.long 0x12835b01,0x45706fbe +.long 0x243185be,0x4ee4b28c +.long 0x243185be,0x4ee4b28c +.long 0x550c7dc3,0xd5ffb4e2 +.long 0x550c7dc3,0xd5ffb4e2 +.long 0x72be5d74,0xf27b896f +.long 0x72be5d74,0xf27b896f +.long 0x80deb1fe,0x3b1696b1 +.long 0x80deb1fe,0x3b1696b1 +.long 0x9bdc06a7,0x25c71235 +.long 0x9bdc06a7,0x25c71235 +.long 0xc19bf174,0xcf692694 +.long 0xc19bf174,0xcf692694 +.long 0xe49b69c1,0x9ef14ad2 +.long 0xe49b69c1,0x9ef14ad2 +.long 0xefbe4786,0x384f25e3 +.long 0xefbe4786,0x384f25e3 +.long 0x0fc19dc6,0x8b8cd5b5 +.long 0x0fc19dc6,0x8b8cd5b5 +.long 0x240ca1cc,0x77ac9c65 +.long 0x240ca1cc,0x77ac9c65 +.long 0x2de92c6f,0x592b0275 +.long 0x2de92c6f,0x592b0275 +.long 0x4a7484aa,0x6ea6e483 +.long 0x4a7484aa,0x6ea6e483 +.long 0x5cb0a9dc,0xbd41fbd4 +.long 0x5cb0a9dc,0xbd41fbd4 +.long 0x76f988da,0x831153b5 +.long 0x76f988da,0x831153b5 +.long 0x983e5152,0xee66dfab +.long 0x983e5152,0xee66dfab +.long 0xa831c66d,0x2db43210 +.long 0xa831c66d,0x2db43210 +.long 0xb00327c8,0x98fb213f +.long 0xb00327c8,0x98fb213f +.long 0xbf597fc7,0xbeef0ee4 +.long 0xbf597fc7,0xbeef0ee4 +.long 0xc6e00bf3,0x3da88fc2 +.long 0xc6e00bf3,0x3da88fc2 +.long 0xd5a79147,0x930aa725 +.long 0xd5a79147,0x930aa725 +.long 0x06ca6351,0xe003826f +.long 0x06ca6351,0xe003826f +.long 0x14292967,0x0a0e6e70 +.long 0x14292967,0x0a0e6e70 +.long 0x27b70a85,0x46d22ffc +.long 0x27b70a85,0x46d22ffc +.long 0x2e1b2138,0x5c26c926 +.long 0x2e1b2138,0x5c26c926 +.long 0x4d2c6dfc,0x5ac42aed +.long 0x4d2c6dfc,0x5ac42aed +.long 0x53380d13,0x9d95b3df +.long 0x53380d13,0x9d95b3df +.long 0x650a7354,0x8baf63de +.long 0x650a7354,0x8baf63de +.long 0x766a0abb,0x3c77b2a8 +.long 0x766a0abb,0x3c77b2a8 +.long 0x81c2c92e,0x47edaee6 +.long 0x81c2c92e,0x47edaee6 +.long 0x92722c85,0x1482353b +.long 0x92722c85,0x1482353b +.long 0xa2bfe8a1,0x4cf10364 +.long 0xa2bfe8a1,0x4cf10364 +.long 0xa81a664b,0xbc423001 +.long 0xa81a664b,0xbc423001 +.long 0xc24b8b70,0xd0f89791 +.long 0xc24b8b70,0xd0f89791 +.long 0xc76c51a3,0x0654be30 +.long 0xc76c51a3,0x0654be30 +.long 0xd192e819,0xd6ef5218 +.long 
0xd192e819,0xd6ef5218 +.long 0xd6990624,0x5565a910 +.long 0xd6990624,0x5565a910 +.long 0xf40e3585,0x5771202a +.long 0xf40e3585,0x5771202a +.long 0x106aa070,0x32bbd1b8 +.long 0x106aa070,0x32bbd1b8 +.long 0x19a4c116,0xb8d2d0c8 +.long 0x19a4c116,0xb8d2d0c8 +.long 0x1e376c08,0x5141ab53 +.long 0x1e376c08,0x5141ab53 +.long 0x2748774c,0xdf8eeb99 +.long 0x2748774c,0xdf8eeb99 +.long 0x34b0bcb5,0xe19b48a8 +.long 0x34b0bcb5,0xe19b48a8 +.long 0x391c0cb3,0xc5c95a63 +.long 0x391c0cb3,0xc5c95a63 +.long 0x4ed8aa4a,0xe3418acb +.long 0x4ed8aa4a,0xe3418acb +.long 0x5b9cca4f,0x7763e373 +.long 0x5b9cca4f,0x7763e373 +.long 0x682e6ff3,0xd6b2b8a3 +.long 0x682e6ff3,0xd6b2b8a3 +.long 0x748f82ee,0x5defb2fc +.long 0x748f82ee,0x5defb2fc +.long 0x78a5636f,0x43172f60 +.long 0x78a5636f,0x43172f60 +.long 0x84c87814,0xa1f0ab72 +.long 0x84c87814,0xa1f0ab72 +.long 0x8cc70208,0x1a6439ec +.long 0x8cc70208,0x1a6439ec +.long 0x90befffa,0x23631e28 +.long 0x90befffa,0x23631e28 +.long 0xa4506ceb,0xde82bde9 +.long 0xa4506ceb,0xde82bde9 +.long 0xbef9a3f7,0xb2c67915 +.long 0xbef9a3f7,0xb2c67915 +.long 0xc67178f2,0xe372532b +.long 0xc67178f2,0xe372532b +.long 0xca273ece,0xea26619c +.long 0xca273ece,0xea26619c +.long 0xd186b8c7,0x21c0c207 +.long 0xd186b8c7,0x21c0c207 +.long 0xeada7dd6,0xcde0eb1e +.long 0xeada7dd6,0xcde0eb1e +.long 0xf57d4f7f,0xee6ed178 +.long 0xf57d4f7f,0xee6ed178 +.long 0x06f067aa,0x72176fba +.long 0x06f067aa,0x72176fba +.long 0x0a637dc5,0xa2c898a6 +.long 0x0a637dc5,0xa2c898a6 +.long 0x113f9804,0xbef90dae +.long 0x113f9804,0xbef90dae +.long 0x1b710b35,0x131c471b +.long 0x1b710b35,0x131c471b +.long 0x28db77f5,0x23047d84 +.long 0x28db77f5,0x23047d84 +.long 0x32caab7b,0x40c72493 +.long 0x32caab7b,0x40c72493 +.long 0x3c9ebe0a,0x15c9bebc +.long 0x3c9ebe0a,0x15c9bebc +.long 0x431d67c4,0x9c100d4c +.long 0x431d67c4,0x9c100d4c +.long 0x4cc5d4be,0xcb3e42b6 +.long 0x4cc5d4be,0xcb3e42b6 +.long 0x597f299c,0xfc657e2a +.long 0x597f299c,0xfc657e2a +.long 0x5fcb6fab,0x3ad6faec +.long 0x5fcb6fab,0x3ad6faec +.long 0x6c44198c,0x4a475817 +.long 0x6c44198c,0x4a475817 +.long 0,0 +.long 0,0 +.long 0x00010203,0x04050607 +.long 0x10111213,0x14151617 + +#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +.abiversion 2 +.text + +.globl zfs_sha512_power8 +.type zfs_sha512_power8,@function +.align 6 +zfs_sha512_power8: +.localentry zfs_sha512_power8,0 + + stdu 1,-384(1) + mflr 8 + li 10,207 + li 11,223 + stvx 24,10,1 + addi 10,10,32 + li 12,-1 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 11,-4096+255 + stw 12,332(1) + li 10,0x10 + std 26,336(1) + li 26,0x20 + std 27,344(1) + li 27,0x30 + std 28,352(1) + li 28,0x40 + std 29,360(1) + li 29,0x50 + std 30,368(1) + li 30,0x60 + std 31,376(1) + li 31,0x70 + std 8,400(1) + or 11,11,11 + + bl .LPICmeup + addi 11,1,79 + li 7,8 + lvsl 31,0,7 + vspltisb 28,0x0f + vxor 31,31,28 + .long 0x7C001E99 + .long 0x7C4A1E99 + .long 0x7C9A1E99 + vsldoi 1,0,0,8 + .long 0x7CDB1E99 + vsldoi 3,2,2,8 + vsldoi 5,4,4,8 + vsldoi 7,6,6,8 + li 0,4 + b .Loop +.align 5 +.Loop: + lvx 28,0,6 + .long 0x7D002699 + addi 4,4,16 + mr 7,6 + stvx 0,0,11 + stvx 1,10,11 + stvx 2,26,11 + stvx 3,27,11 + stvx 4,28,11 + stvx 5,29,11 + stvx 6,30,11 + stvx 7,31,11 + .long 0x10E7E0C0 + lvx 28,10,6 + vperm 8,8,8,31 + .long 0x10E740C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 
0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x7D402699 + addi 4,4,16 + vsldoi 9,8,8,8 + .long 0x10C648C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + vperm 10,10,10,31 + .long 0x10A550C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x7D802699 + addi 4,4,16 + vsldoi 11,10,10,8 + .long 0x108458C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + vperm 12,12,12,31 + .long 0x106360C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x7DC02699 + addi 4,4,16 + vsldoi 13,12,12,8 + .long 0x104268C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + vperm 14,14,14,31 + .long 0x102170C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x7E002699 + addi 4,4,16 + vsldoi 15,14,14,8 + .long 0x100078C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + vperm 16,16,16,31 + .long 0x10E780C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x7E402699 + addi 4,4,16 + vsldoi 17,16,16,8 + .long 0x10C688C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + vperm 18,18,18,31 + .long 0x10A590C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x7F002699 + addi 4,4,16 + vsldoi 19,18,18,8 + .long 0x108498C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + vperm 24,24,24,31 + .long 0x1063C0C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x7F402699 + addi 4,4,16 + vsldoi 25,24,24,8 + .long 0x1042C8C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 
28,31,7 + addi 7,7,0x80 + vperm 26,26,26,31 + .long 0x1021D0C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + vsldoi 27,26,26,8 + .long 0x13C906C2 + .long 0x1108F0C0 + .long 0x13DA7EC2 + .long 0x1108F0C0 + .long 0x110888C0 + .long 0x1000D8C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + mtctr 0 + b .L16_xx +.align 5 +.L16_xx: + .long 0x13CA06C2 + .long 0x1129F0C0 + .long 0x13DB7EC2 + .long 0x1129F0C0 + .long 0x112990C0 + .long 0x10E740C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x13CB06C2 + .long 0x114AF0C0 + .long 0x13C87EC2 + .long 0x114AF0C0 + .long 0x114A98C0 + .long 0x10C648C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x13CC06C2 + .long 0x116BF0C0 + .long 0x13C97EC2 + .long 0x116BF0C0 + .long 0x116BC0C0 + .long 0x10A550C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x13CD06C2 + .long 0x118CF0C0 + .long 0x13CA7EC2 + .long 0x118CF0C0 + .long 0x118CC8C0 + .long 0x108458C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x13CE06C2 + .long 0x11ADF0C0 + .long 0x13CB7EC2 + .long 0x11ADF0C0 + .long 0x11ADD0C0 + .long 0x106360C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x13CF06C2 + .long 0x11CEF0C0 + .long 0x13CC7EC2 + .long 0x11CEF0C0 + .long 0x11CED8C0 + .long 0x104268C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13D006C2 + .long 0x11EFF0C0 + .long 0x13CD7EC2 + .long 0x11EFF0C0 + .long 0x11EF40C0 + .long 0x102170C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x13D106C2 + .long 0x1210F0C0 + .long 0x13CE7EC2 + .long 0x1210F0C0 + .long 0x121048C0 + .long 0x100078C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + .long 0x13D206C2 + .long 0x1231F0C0 + .long 0x13CF7EC2 + .long 0x1231F0C0 + .long 0x123150C0 + .long 0x10E780C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + 
.long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x13D306C2 + .long 0x1252F0C0 + .long 0x13D07EC2 + .long 0x1252F0C0 + .long 0x125258C0 + .long 0x10C688C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x13D806C2 + .long 0x1273F0C0 + .long 0x13D17EC2 + .long 0x1273F0C0 + .long 0x127360C0 + .long 0x10A590C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x13D906C2 + .long 0x1318F0C0 + .long 0x13D27EC2 + .long 0x1318F0C0 + .long 0x131868C0 + .long 0x108498C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x13DA06C2 + .long 0x1339F0C0 + .long 0x13D37EC2 + .long 0x1339F0C0 + .long 0x133970C0 + .long 0x1063C0C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x13DB06C2 + .long 0x135AF0C0 + .long 0x13D87EC2 + .long 0x135AF0C0 + .long 0x135A78C0 + .long 0x1042C8C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13C806C2 + .long 0x137BF0C0 + .long 0x13D97EC2 + .long 0x137BF0C0 + .long 0x137B80C0 + .long 0x1021D0C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x13C906C2 + .long 0x1108F0C0 + .long 0x13DA7EC2 + .long 0x1108F0C0 + .long 0x110888C0 + .long 0x1000D8C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + bdnz .L16_xx + + lvx 10,0,11 + subic. 
5,5,1 + lvx 11,10,11 + .long 0x100050C0 + lvx 12,26,11 + .long 0x102158C0 + lvx 13,27,11 + .long 0x104260C0 + lvx 14,28,11 + .long 0x106368C0 + lvx 15,29,11 + .long 0x108470C0 + lvx 16,30,11 + .long 0x10A578C0 + lvx 17,31,11 + .long 0x10C680C0 + .long 0x10E788C0 + bne .Loop + vperm 0,0,1,28 + vperm 2,2,3,28 + vperm 4,4,5,28 + vperm 6,6,7,28 + .long 0x7C001F99 + .long 0x7C4A1F99 + .long 0x7C9A1F99 + .long 0x7CDB1F99 + addi 11,1,207 + mtlr 8 + or 12,12,12 + lvx 24,0,11 + lvx 25,10,11 + lvx 26,26,11 + lvx 27,27,11 + lvx 28,28,11 + lvx 29,29,11 + lvx 30,30,11 + lvx 31,31,11 + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,6,3,0 +.long 0 +.size zfs_sha512_power8,.-zfs_sha512_power8 +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0xd728ae22,0x428a2f98 +.long 0xd728ae22,0x428a2f98 +.long 0x23ef65cd,0x71374491 +.long 0x23ef65cd,0x71374491 +.long 0xec4d3b2f,0xb5c0fbcf +.long 0xec4d3b2f,0xb5c0fbcf +.long 0x8189dbbc,0xe9b5dba5 +.long 0x8189dbbc,0xe9b5dba5 +.long 0xf348b538,0x3956c25b +.long 0xf348b538,0x3956c25b +.long 0xb605d019,0x59f111f1 +.long 0xb605d019,0x59f111f1 +.long 0xaf194f9b,0x923f82a4 +.long 0xaf194f9b,0x923f82a4 +.long 0xda6d8118,0xab1c5ed5 +.long 0xda6d8118,0xab1c5ed5 +.long 0xa3030242,0xd807aa98 +.long 0xa3030242,0xd807aa98 +.long 0x45706fbe,0x12835b01 +.long 0x45706fbe,0x12835b01 +.long 0x4ee4b28c,0x243185be +.long 0x4ee4b28c,0x243185be +.long 0xd5ffb4e2,0x550c7dc3 +.long 0xd5ffb4e2,0x550c7dc3 +.long 0xf27b896f,0x72be5d74 +.long 0xf27b896f,0x72be5d74 +.long 0x3b1696b1,0x80deb1fe +.long 0x3b1696b1,0x80deb1fe +.long 0x25c71235,0x9bdc06a7 +.long 0x25c71235,0x9bdc06a7 +.long 0xcf692694,0xc19bf174 +.long 0xcf692694,0xc19bf174 +.long 0x9ef14ad2,0xe49b69c1 +.long 0x9ef14ad2,0xe49b69c1 +.long 0x384f25e3,0xefbe4786 +.long 0x384f25e3,0xefbe4786 +.long 0x8b8cd5b5,0x0fc19dc6 +.long 0x8b8cd5b5,0x0fc19dc6 +.long 0x77ac9c65,0x240ca1cc +.long 0x77ac9c65,0x240ca1cc +.long 0x592b0275,0x2de92c6f +.long 0x592b0275,0x2de92c6f +.long 0x6ea6e483,0x4a7484aa +.long 0x6ea6e483,0x4a7484aa +.long 0xbd41fbd4,0x5cb0a9dc +.long 0xbd41fbd4,0x5cb0a9dc +.long 0x831153b5,0x76f988da +.long 0x831153b5,0x76f988da +.long 0xee66dfab,0x983e5152 +.long 0xee66dfab,0x983e5152 +.long 0x2db43210,0xa831c66d +.long 0x2db43210,0xa831c66d +.long 0x98fb213f,0xb00327c8 +.long 0x98fb213f,0xb00327c8 +.long 0xbeef0ee4,0xbf597fc7 +.long 0xbeef0ee4,0xbf597fc7 +.long 0x3da88fc2,0xc6e00bf3 +.long 0x3da88fc2,0xc6e00bf3 +.long 0x930aa725,0xd5a79147 +.long 0x930aa725,0xd5a79147 +.long 0xe003826f,0x06ca6351 +.long 0xe003826f,0x06ca6351 +.long 0x0a0e6e70,0x14292967 +.long 0x0a0e6e70,0x14292967 +.long 0x46d22ffc,0x27b70a85 +.long 0x46d22ffc,0x27b70a85 +.long 0x5c26c926,0x2e1b2138 +.long 0x5c26c926,0x2e1b2138 +.long 0x5ac42aed,0x4d2c6dfc +.long 0x5ac42aed,0x4d2c6dfc +.long 0x9d95b3df,0x53380d13 +.long 0x9d95b3df,0x53380d13 +.long 0x8baf63de,0x650a7354 +.long 0x8baf63de,0x650a7354 +.long 0x3c77b2a8,0x766a0abb +.long 0x3c77b2a8,0x766a0abb +.long 0x47edaee6,0x81c2c92e +.long 0x47edaee6,0x81c2c92e +.long 0x1482353b,0x92722c85 +.long 0x1482353b,0x92722c85 +.long 0x4cf10364,0xa2bfe8a1 +.long 0x4cf10364,0xa2bfe8a1 +.long 0xbc423001,0xa81a664b +.long 0xbc423001,0xa81a664b +.long 0xd0f89791,0xc24b8b70 +.long 0xd0f89791,0xc24b8b70 +.long 0x0654be30,0xc76c51a3 +.long 0x0654be30,0xc76c51a3 +.long 0xd6ef5218,0xd192e819 +.long 0xd6ef5218,0xd192e819 +.long 0x5565a910,0xd6990624 
+.long 0x5565a910,0xd6990624 +.long 0x5771202a,0xf40e3585 +.long 0x5771202a,0xf40e3585 +.long 0x32bbd1b8,0x106aa070 +.long 0x32bbd1b8,0x106aa070 +.long 0xb8d2d0c8,0x19a4c116 +.long 0xb8d2d0c8,0x19a4c116 +.long 0x5141ab53,0x1e376c08 +.long 0x5141ab53,0x1e376c08 +.long 0xdf8eeb99,0x2748774c +.long 0xdf8eeb99,0x2748774c +.long 0xe19b48a8,0x34b0bcb5 +.long 0xe19b48a8,0x34b0bcb5 +.long 0xc5c95a63,0x391c0cb3 +.long 0xc5c95a63,0x391c0cb3 +.long 0xe3418acb,0x4ed8aa4a +.long 0xe3418acb,0x4ed8aa4a +.long 0x7763e373,0x5b9cca4f +.long 0x7763e373,0x5b9cca4f +.long 0xd6b2b8a3,0x682e6ff3 +.long 0xd6b2b8a3,0x682e6ff3 +.long 0x5defb2fc,0x748f82ee +.long 0x5defb2fc,0x748f82ee +.long 0x43172f60,0x78a5636f +.long 0x43172f60,0x78a5636f +.long 0xa1f0ab72,0x84c87814 +.long 0xa1f0ab72,0x84c87814 +.long 0x1a6439ec,0x8cc70208 +.long 0x1a6439ec,0x8cc70208 +.long 0x23631e28,0x90befffa +.long 0x23631e28,0x90befffa +.long 0xde82bde9,0xa4506ceb +.long 0xde82bde9,0xa4506ceb +.long 0xb2c67915,0xbef9a3f7 +.long 0xb2c67915,0xbef9a3f7 +.long 0xe372532b,0xc67178f2 +.long 0xe372532b,0xc67178f2 +.long 0xea26619c,0xca273ece +.long 0xea26619c,0xca273ece +.long 0x21c0c207,0xd186b8c7 +.long 0x21c0c207,0xd186b8c7 +.long 0xcde0eb1e,0xeada7dd6 +.long 0xcde0eb1e,0xeada7dd6 +.long 0xee6ed178,0xf57d4f7f +.long 0xee6ed178,0xf57d4f7f +.long 0x72176fba,0x06f067aa +.long 0x72176fba,0x06f067aa +.long 0xa2c898a6,0x0a637dc5 +.long 0xa2c898a6,0x0a637dc5 +.long 0xbef90dae,0x113f9804 +.long 0xbef90dae,0x113f9804 +.long 0x131c471b,0x1b710b35 +.long 0x131c471b,0x1b710b35 +.long 0x23047d84,0x28db77f5 +.long 0x23047d84,0x28db77f5 +.long 0x40c72493,0x32caab7b +.long 0x40c72493,0x32caab7b +.long 0x15c9bebc,0x3c9ebe0a +.long 0x15c9bebc,0x3c9ebe0a +.long 0x9c100d4c,0x431d67c4 +.long 0x9c100d4c,0x431d67c4 +.long 0xcb3e42b6,0x4cc5d4be +.long 0xcb3e42b6,0x4cc5d4be +.long 0xfc657e2a,0x597f299c +.long 0xfc657e2a,0x597f299c +.long 0x3ad6faec,0x5fcb6fab +.long 0x3ad6faec,0x5fcb6fab +.long 0x4a475817,0x6c44198c +.long 0x4a475817,0x6c44198c +.long 0,0 +.long 0,0 +.long 0x14151617,0x10111213 +.long 0x04050607,0x00010203 + +#endif diff --git a/module/icp/asm-ppc64/sha2/sha512-ppc.S b/module/icp/asm-ppc64/sha2/sha512-ppc.S new file mode 100644 index 000000000000..37070115c3ff --- /dev/null +++ b/module/icp/asm-ppc64/sha2/sha512-ppc.S @@ -0,0 +1,2958 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
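For orientation while reading the unrolled rotrdi/srdi/andc sequences in sha512-ppc.S below (and the analogous 32-bit rotrwi code in sha256-ppc.S above), here is a minimal C reference of the per-round math from FIPS 180-4 that each unrolled block of the assembly computes. The macro and function names are illustrative only and are not part of the patch.

#include <stdint.h>

/*
 * Illustrative only (not part of the patch): the SHA-512 round primitives
 * from FIPS 180-4 that the hand-scheduled rotrdi/srdi/andc sequences in
 * this file compute.  The 32-bit SHA-256 code above has the same shape,
 * with rotation counts 2/13/22, 6/11/25, 7/18/shr3 and 17/19/shr10.
 */
#define ROTR64(x, n)	(((x) >> (n)) | ((x) << (64 - (n))))

#define Ch(x, y, z)	(((x) & (y)) ^ (~(x) & (z)))
#define Maj(x, y, z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define Sigma0(x)	(ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
#define Sigma1(x)	(ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
#define sigma0(x)	(ROTR64(x, 1) ^ ROTR64(x, 8) ^ ((x) >> 7))
#define sigma1(x)	(ROTR64(x, 19) ^ ROTR64(x, 61) ^ ((x) >> 6))

/*
 * One round: s[0..7] are the working variables a..h, Kt the round constant
 * (the .long table emitted after .LPICmeup), Wt the expanded message word.
 */
static inline void
sha512_round(uint64_t s[8], uint64_t Kt, uint64_t Wt)
{
	uint64_t T1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + Kt + Wt;
	uint64_t T2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);

	s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + T1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = T1 + T2;
}

The scalar routine keeps a through h in GPRs 8-12, 6, 14 and 15 (see the ld sequence in its prologue) and renames their roles from round to round rather than shuffling values, which is why the same instruction pattern repeats with permuted register numbers.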
+ */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) + +.text + +.globl zfs_sha512_ppc +.globl .zfs_sha512_ppc +.type zfs_sha512_ppc,@function +.section ".opd","aw" +.align 3 +zfs_sha512_ppc: +.quad .zfs_sha512_ppc,.TOC.@tocbase,0 +.previous +.align 6 +.zfs_sha512_ppc: + stdu 1,-384(1) + mflr 0 + sldi 5,5,7 + + std 3,208(1) + + std 14,240(1) + std 15,248(1) + std 16,256(1) + std 17,264(1) + std 18,272(1) + std 19,280(1) + std 20,288(1) + std 21,296(1) + std 22,304(1) + std 23,312(1) + std 24,320(1) + std 25,328(1) + std 26,336(1) + std 27,344(1) + std 28,352(1) + std 29,360(1) + std 30,368(1) + std 31,376(1) + std 0,400(1) + ld 8,0(3) + mr 31,4 + ld 9,8(3) + ld 10,16(3) + ld 11,24(3) + ld 12,32(3) + ld 6,40(3) + ld 14,48(3) + ld 15,56(3) + bl .LPICmeup +.LPICedup: + andi. 0,31,3 + bne .Lunaligned +.Laligned: + add 5,31,5 + std 5,192(1) + std 31,200(1) + bl .Lsha2_block_private + b .Ldone + + + + + + + +.align 4 +.Lunaligned: + subfic 0,31,4096 + andi. 0,0,3968 + beq .Lcross_page + cmpld 5,0 + ble .Laligned + subfc 5,0,5 + add 0,31,0 + std 5,184(1) + std 0,192(1) + std 31,200(1) + bl .Lsha2_block_private + + ld 5,184(1) +.Lcross_page: + li 0,32 + mtctr 0 + addi 20,1,48 +.Lmemcpy: + lbz 16,0(31) + lbz 17,1(31) + lbz 18,2(31) + lbz 19,3(31) + addi 31,31,4 + stb 16,0(20) + stb 17,1(20) + stb 18,2(20) + stb 19,3(20) + addi 20,20,4 + bdnz .Lmemcpy + std 31,176(1) + addi 0,1,176 + addi 31,1,48 + std 5,184(1) + std 0,192(1) + std 31,200(1) + bl .Lsha2_block_private + ld 31,176(1) + ld 5,184(1) + addic. 5,5,-128 + bne .Lunaligned + +.Ldone: + ld 0,400(1) + ld 14,240(1) + ld 15,248(1) + ld 16,256(1) + ld 17,264(1) + ld 18,272(1) + ld 19,280(1) + ld 20,288(1) + ld 21,296(1) + ld 22,304(1) + ld 23,312(1) + ld 24,320(1) + ld 25,328(1) + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + mtlr 0 + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,18,3,0 +.long 0 +.align 4 +.Lsha2_block_private: + ld 0,0(7) + lwz 5,0(31) + lwz 16,4(31) + insrdi 16,5,32,0 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + ld 0,8(7) + add 15,15,3 + add 15,15,5 + + lwz 5,8(31) + lwz 17,12(31) + insrdi 17,5,32,0 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + ld 0,16(7) + add 14,14,3 + add 14,14,5 + + lwz 5,16(31) + lwz 18,20(31) + insrdi 18,5,32,0 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + ld 0,24(7) + add 6,6,3 + add 6,6,5 + + lwz 5,24(31) + lwz 19,28(31) + insrdi 19,5,32,0 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + 
and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + ld 0,32(7) + add 12,12,3 + add 12,12,5 + + lwz 5,32(31) + lwz 20,36(31) + insrdi 20,5,32,0 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + ld 0,40(7) + add 11,11,3 + add 11,11,5 + + lwz 5,40(31) + lwz 21,44(31) + insrdi 21,5,32,0 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + ld 0,48(7) + add 10,10,3 + add 10,10,5 + + lwz 5,48(31) + lwz 22,52(31) + insrdi 22,5,32,0 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + ld 0,56(7) + add 9,9,3 + add 9,9,5 + + lwz 5,56(31) + lwz 23,60(31) + insrdi 23,5,32,0 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + ld 0,64(7) + add 8,8,3 + add 8,8,5 + + lwz 5,64(31) + lwz 24,68(31) + insrdi 24,5,32,0 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + ld 0,72(7) + add 15,15,3 + add 15,15,5 + + lwz 5,72(31) + lwz 25,76(31) + insrdi 25,5,32,0 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + ld 0,80(7) + add 14,14,3 + add 14,14,5 + + lwz 5,80(31) + lwz 26,84(31) + insrdi 26,5,32,0 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + ld 0,88(7) + add 6,6,3 + add 6,6,5 + + lwz 5,88(31) + lwz 27,92(31) + insrdi 27,5,32,0 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + ld 0,96(7) + add 12,12,3 + add 12,12,5 + + lwz 5,96(31) + lwz 28,100(31) + insrdi 28,5,32,0 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + 
andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + ld 0,104(7) + add 11,11,3 + add 11,11,5 + + lwz 5,104(31) + lwz 29,108(31) + insrdi 29,5,32,0 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + ld 0,112(7) + add 10,10,3 + add 10,10,5 + + lwz 5,112(31) + lwz 30,116(31) + insrdi 30,5,32,0 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + ld 0,120(7) + add 9,9,3 + add 9,9,5 + + lwz 5,120(31) + lwz 31,124(31) + insrdi 31,5,32,0 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + li 5,4 + mtctr 5 +.align 4 +.Lrounds: + addi 7,7,128 + rotrdi 3,17,1 + rotrdi 4,17,8 + rotrdi 5,30,19 + rotrdi 0,30,61 + xor 3,3,4 + srdi 4,17,7 + xor 5,5,0 + srdi 0,30,6 + add 16,16,25 + xor 3,3,4 + xor 5,5,0 + ld 0,0(7) + add 16,16,3 + add 16,16,5 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrdi 3,18,1 + rotrdi 4,18,8 + rotrdi 5,31,19 + rotrdi 0,31,61 + xor 3,3,4 + srdi 4,18,7 + xor 5,5,0 + srdi 0,31,6 + add 17,17,26 + xor 3,3,4 + xor 5,5,0 + ld 0,8(7) + add 17,17,3 + add 17,17,5 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrdi 3,19,1 + rotrdi 4,19,8 + rotrdi 5,16,19 + rotrdi 0,16,61 + xor 3,3,4 + srdi 4,19,7 + xor 5,5,0 + srdi 0,16,6 + add 18,18,27 + xor 3,3,4 + xor 5,5,0 + ld 0,16(7) + add 18,18,3 + add 18,18,5 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrdi 3,20,1 + rotrdi 4,20,8 + rotrdi 5,17,19 + rotrdi 0,17,61 + xor 3,3,4 + srdi 4,20,7 + xor 5,5,0 + srdi 0,17,6 + add 19,19,28 + xor 3,3,4 + xor 5,5,0 + ld 0,24(7) + add 19,19,3 + add 19,19,5 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 
12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrdi 3,21,1 + rotrdi 4,21,8 + rotrdi 5,18,19 + rotrdi 0,18,61 + xor 3,3,4 + srdi 4,21,7 + xor 5,5,0 + srdi 0,18,6 + add 20,20,29 + xor 3,3,4 + xor 5,5,0 + ld 0,32(7) + add 20,20,3 + add 20,20,5 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrdi 3,22,1 + rotrdi 4,22,8 + rotrdi 5,19,19 + rotrdi 0,19,61 + xor 3,3,4 + srdi 4,22,7 + xor 5,5,0 + srdi 0,19,6 + add 21,21,30 + xor 3,3,4 + xor 5,5,0 + ld 0,40(7) + add 21,21,3 + add 21,21,5 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrdi 3,23,1 + rotrdi 4,23,8 + rotrdi 5,20,19 + rotrdi 0,20,61 + xor 3,3,4 + srdi 4,23,7 + xor 5,5,0 + srdi 0,20,6 + add 22,22,31 + xor 3,3,4 + xor 5,5,0 + ld 0,48(7) + add 22,22,3 + add 22,22,5 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrdi 3,24,1 + rotrdi 4,24,8 + rotrdi 5,21,19 + rotrdi 0,21,61 + xor 3,3,4 + srdi 4,24,7 + xor 5,5,0 + srdi 0,21,6 + add 23,23,16 + xor 3,3,4 + xor 5,5,0 + ld 0,56(7) + add 23,23,3 + add 23,23,5 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + rotrdi 3,25,1 + rotrdi 4,25,8 + rotrdi 5,22,19 + rotrdi 0,22,61 + xor 3,3,4 + srdi 4,25,7 + xor 5,5,0 + srdi 0,22,6 + add 24,24,17 + xor 3,3,4 + xor 5,5,0 + ld 0,64(7) + add 24,24,3 + add 24,24,5 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrdi 3,26,1 + rotrdi 4,26,8 + rotrdi 5,23,19 + rotrdi 0,23,61 + xor 3,3,4 + srdi 4,26,7 + xor 5,5,0 + srdi 0,23,6 + add 25,25,18 + xor 3,3,4 + xor 5,5,0 + ld 0,72(7) + add 25,25,3 + add 25,25,5 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrdi 3,27,1 + rotrdi 4,27,8 + rotrdi 5,24,19 + rotrdi 0,24,61 + xor 3,3,4 + srdi 4,27,7 + 
xor 5,5,0 + srdi 0,24,6 + add 26,26,19 + xor 3,3,4 + xor 5,5,0 + ld 0,80(7) + add 26,26,3 + add 26,26,5 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrdi 3,28,1 + rotrdi 4,28,8 + rotrdi 5,25,19 + rotrdi 0,25,61 + xor 3,3,4 + srdi 4,28,7 + xor 5,5,0 + srdi 0,25,6 + add 27,27,20 + xor 3,3,4 + xor 5,5,0 + ld 0,88(7) + add 27,27,3 + add 27,27,5 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrdi 3,29,1 + rotrdi 4,29,8 + rotrdi 5,26,19 + rotrdi 0,26,61 + xor 3,3,4 + srdi 4,29,7 + xor 5,5,0 + srdi 0,26,6 + add 28,28,21 + xor 3,3,4 + xor 5,5,0 + ld 0,96(7) + add 28,28,3 + add 28,28,5 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrdi 3,30,1 + rotrdi 4,30,8 + rotrdi 5,27,19 + rotrdi 0,27,61 + xor 3,3,4 + srdi 4,30,7 + xor 5,5,0 + srdi 0,27,6 + add 29,29,22 + xor 3,3,4 + xor 5,5,0 + ld 0,104(7) + add 29,29,3 + add 29,29,5 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrdi 3,31,1 + rotrdi 4,31,8 + rotrdi 5,28,19 + rotrdi 0,28,61 + xor 3,3,4 + srdi 4,31,7 + xor 5,5,0 + srdi 0,28,6 + add 30,30,23 + xor 3,3,4 + xor 5,5,0 + ld 0,112(7) + add 30,30,3 + add 30,30,5 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrdi 3,16,1 + rotrdi 4,16,8 + rotrdi 5,29,19 + rotrdi 0,29,61 + xor 3,3,4 + srdi 4,16,7 + xor 5,5,0 + srdi 0,29,6 + add 31,31,24 + xor 3,3,4 + xor 5,5,0 + ld 0,120(7) + add 31,31,3 + add 31,31,5 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + bdnz .Lrounds + + ld 3,208(1) + ld 31,200(1) + ld 5,192(1) + subi 7,7,512 + + ld 16,0(3) + ld 17,8(3) + ld 18,16(3) + ld 19,24(3) + ld 20,32(3) + ld 21,40(3) + ld 22,48(3) + addi 31,31,128 + ld 23,56(3) + add 8,8,16 + add 9,9,17 + std 31,200(1) + add 10,10,18 + std 8,0(3) + add 11,11,19 + std 9,8(3) + add 12,12,20 + std 10,16(3) + add 6,6,21 + std 11,24(3) + add 14,14,22 + std 12,32(3) 
+ add 15,15,23 + std 6,40(3) + std 14,48(3) + cmpld 31,5 + std 15,56(3) + bne .Lsha2_block_private + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size .zfs_sha512_ppc,.-.zfs_sha512_ppc +.size zfs_sha512_ppc,.-.zfs_sha512_ppc +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 7 + addi 7,7,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0xd728ae22 +.long 0x71374491,0x23ef65cd +.long 0xb5c0fbcf,0xec4d3b2f +.long 0xe9b5dba5,0x8189dbbc +.long 0x3956c25b,0xf348b538 +.long 0x59f111f1,0xb605d019 +.long 0x923f82a4,0xaf194f9b +.long 0xab1c5ed5,0xda6d8118 +.long 0xd807aa98,0xa3030242 +.long 0x12835b01,0x45706fbe +.long 0x243185be,0x4ee4b28c +.long 0x550c7dc3,0xd5ffb4e2 +.long 0x72be5d74,0xf27b896f +.long 0x80deb1fe,0x3b1696b1 +.long 0x9bdc06a7,0x25c71235 +.long 0xc19bf174,0xcf692694 +.long 0xe49b69c1,0x9ef14ad2 +.long 0xefbe4786,0x384f25e3 +.long 0x0fc19dc6,0x8b8cd5b5 +.long 0x240ca1cc,0x77ac9c65 +.long 0x2de92c6f,0x592b0275 +.long 0x4a7484aa,0x6ea6e483 +.long 0x5cb0a9dc,0xbd41fbd4 +.long 0x76f988da,0x831153b5 +.long 0x983e5152,0xee66dfab +.long 0xa831c66d,0x2db43210 +.long 0xb00327c8,0x98fb213f +.long 0xbf597fc7,0xbeef0ee4 +.long 0xc6e00bf3,0x3da88fc2 +.long 0xd5a79147,0x930aa725 +.long 0x06ca6351,0xe003826f +.long 0x14292967,0x0a0e6e70 +.long 0x27b70a85,0x46d22ffc +.long 0x2e1b2138,0x5c26c926 +.long 0x4d2c6dfc,0x5ac42aed +.long 0x53380d13,0x9d95b3df +.long 0x650a7354,0x8baf63de +.long 0x766a0abb,0x3c77b2a8 +.long 0x81c2c92e,0x47edaee6 +.long 0x92722c85,0x1482353b +.long 0xa2bfe8a1,0x4cf10364 +.long 0xa81a664b,0xbc423001 +.long 0xc24b8b70,0xd0f89791 +.long 0xc76c51a3,0x0654be30 +.long 0xd192e819,0xd6ef5218 +.long 0xd6990624,0x5565a910 +.long 0xf40e3585,0x5771202a +.long 0x106aa070,0x32bbd1b8 +.long 0x19a4c116,0xb8d2d0c8 +.long 0x1e376c08,0x5141ab53 +.long 0x2748774c,0xdf8eeb99 +.long 0x34b0bcb5,0xe19b48a8 +.long 0x391c0cb3,0xc5c95a63 +.long 0x4ed8aa4a,0xe3418acb +.long 0x5b9cca4f,0x7763e373 +.long 0x682e6ff3,0xd6b2b8a3 +.long 0x748f82ee,0x5defb2fc +.long 0x78a5636f,0x43172f60 +.long 0x84c87814,0xa1f0ab72 +.long 0x8cc70208,0x1a6439ec +.long 0x90befffa,0x23631e28 +.long 0xa4506ceb,0xde82bde9 +.long 0xbef9a3f7,0xb2c67915 +.long 0xc67178f2,0xe372532b +.long 0xca273ece,0xea26619c +.long 0xd186b8c7,0x21c0c207 +.long 0xeada7dd6,0xcde0eb1e +.long 0xf57d4f7f,0xee6ed178 +.long 0x06f067aa,0x72176fba +.long 0x0a637dc5,0xa2c898a6 +.long 0x113f9804,0xbef90dae +.long 0x1b710b35,0x131c471b +.long 0x28db77f5,0x23047d84 +.long 0x32caab7b,0x40c72493 +.long 0x3c9ebe0a,0x15c9bebc +.long 0x431d67c4,0x9c100d4c +.long 0x4cc5d4be,0xcb3e42b6 +.long 0x597f299c,0xfc657e2a +.long 0x5fcb6fab,0x3ad6faec +.long 0x6c44198c,0x4a475817 + +#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +.abiversion 2 +.text + +.globl zfs_sha512_ppc +.type zfs_sha512_ppc,@function +.align 6 +zfs_sha512_ppc: +.localentry zfs_sha512_ppc,0 + + stdu 1,-384(1) + mflr 0 + sldi 5,5,7 + + std 3,208(1) + + std 14,240(1) + std 15,248(1) + std 16,256(1) + std 17,264(1) + std 18,272(1) + std 19,280(1) + std 20,288(1) + std 21,296(1) + std 22,304(1) + std 23,312(1) + std 24,320(1) + std 25,328(1) + std 26,336(1) + std 27,344(1) + std 28,352(1) + std 29,360(1) + std 30,368(1) + std 31,376(1) + std 0,400(1) + ld 8,0(3) + mr 31,4 + ld 9,8(3) + ld 10,16(3) + ld 11,24(3) + ld 12,32(3) + ld 6,40(3) + ld 14,48(3) + ld 15,56(3) + bl .LPICmeup +.LPICedup: + andi. 
0,31,3 + bne .Lunaligned +.Laligned: + add 5,31,5 + std 5,192(1) + std 31,200(1) + bl .Lsha2_block_private + b .Ldone + +.align 4 +.Lunaligned: + subfic 0,31,4096 + andi. 0,0,3968 + beq .Lcross_page + cmpld 5,0 + ble .Laligned + subfc 5,0,5 + add 0,31,0 + std 5,184(1) + std 0,192(1) + std 31,200(1) + bl .Lsha2_block_private + + ld 5,184(1) +.Lcross_page: + li 0,32 + mtctr 0 + addi 20,1,48 +.Lmemcpy: + lbz 16,0(31) + lbz 17,1(31) + lbz 18,2(31) + lbz 19,3(31) + addi 31,31,4 + stb 16,0(20) + stb 17,1(20) + stb 18,2(20) + stb 19,3(20) + addi 20,20,4 + bdnz .Lmemcpy + std 31,176(1) + addi 0,1,176 + addi 31,1,48 + std 5,184(1) + std 0,192(1) + std 31,200(1) + bl .Lsha2_block_private + ld 31,176(1) + ld 5,184(1) + addic. 5,5,-128 + bne .Lunaligned + +.Ldone: + ld 0,400(1) + ld 14,240(1) + ld 15,248(1) + ld 16,256(1) + ld 17,264(1) + ld 18,272(1) + ld 19,280(1) + ld 20,288(1) + ld 21,296(1) + ld 22,304(1) + ld 23,312(1) + ld 24,320(1) + ld 25,328(1) + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + mtlr 0 + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,18,3,0 +.long 0 +.align 4 +.Lsha2_block_private: + ld 0,0(7) + lwz 3,0(31) + lwz 4,4(31) + rotlwi 5,3,8 + rotlwi 16,4,8 + rlwimi 5,3,24,0,7 + rlwimi 16,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 16,4,24,16,23 + insrdi 16,5,32,0 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + ld 0,8(7) + add 15,15,3 + add 15,15,5 + + lwz 3,8(31) + lwz 4,12(31) + rotlwi 5,3,8 + rotlwi 17,4,8 + rlwimi 5,3,24,0,7 + rlwimi 17,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 17,4,24,16,23 + insrdi 17,5,32,0 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + ld 0,16(7) + add 14,14,3 + add 14,14,5 + + lwz 3,16(31) + lwz 4,20(31) + rotlwi 5,3,8 + rotlwi 18,4,8 + rlwimi 5,3,24,0,7 + rlwimi 18,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 18,4,24,16,23 + insrdi 18,5,32,0 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + ld 0,24(7) + add 6,6,3 + add 6,6,5 + + lwz 3,24(31) + lwz 4,28(31) + rotlwi 5,3,8 + rotlwi 19,4,8 + rlwimi 5,3,24,0,7 + rlwimi 19,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 19,4,24,16,23 + insrdi 19,5,32,0 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + ld 0,32(7) + add 12,12,3 + add 12,12,5 + + lwz 3,32(31) + lwz 4,36(31) + rotlwi 5,3,8 + rotlwi 20,4,8 + rlwimi 5,3,24,0,7 + rlwimi 20,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 20,4,24,16,23 + insrdi 20,5,32,0 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 
11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + ld 0,40(7) + add 11,11,3 + add 11,11,5 + + lwz 3,40(31) + lwz 4,44(31) + rotlwi 5,3,8 + rotlwi 21,4,8 + rlwimi 5,3,24,0,7 + rlwimi 21,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 21,4,24,16,23 + insrdi 21,5,32,0 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + ld 0,48(7) + add 10,10,3 + add 10,10,5 + + lwz 3,48(31) + lwz 4,52(31) + rotlwi 5,3,8 + rotlwi 22,4,8 + rlwimi 5,3,24,0,7 + rlwimi 22,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 22,4,24,16,23 + insrdi 22,5,32,0 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + ld 0,56(7) + add 9,9,3 + add 9,9,5 + + lwz 3,56(31) + lwz 4,60(31) + rotlwi 5,3,8 + rotlwi 23,4,8 + rlwimi 5,3,24,0,7 + rlwimi 23,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 23,4,24,16,23 + insrdi 23,5,32,0 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + ld 0,64(7) + add 8,8,3 + add 8,8,5 + + lwz 3,64(31) + lwz 4,68(31) + rotlwi 5,3,8 + rotlwi 24,4,8 + rlwimi 5,3,24,0,7 + rlwimi 24,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 24,4,24,16,23 + insrdi 24,5,32,0 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + ld 0,72(7) + add 15,15,3 + add 15,15,5 + + lwz 3,72(31) + lwz 4,76(31) + rotlwi 5,3,8 + rotlwi 25,4,8 + rlwimi 5,3,24,0,7 + rlwimi 25,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 25,4,24,16,23 + insrdi 25,5,32,0 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + ld 0,80(7) + add 14,14,3 + add 14,14,5 + + lwz 3,80(31) + lwz 4,84(31) + rotlwi 5,3,8 + rotlwi 26,4,8 + rlwimi 5,3,24,0,7 + rlwimi 26,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 26,4,24,16,23 + insrdi 26,5,32,0 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + ld 0,88(7) + add 6,6,3 + add 6,6,5 + + lwz 3,88(31) + lwz 4,92(31) + rotlwi 5,3,8 + rotlwi 27,4,8 + rlwimi 5,3,24,0,7 + rlwimi 27,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 27,4,24,16,23 + insrdi 
27,5,32,0 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + ld 0,96(7) + add 12,12,3 + add 12,12,5 + + lwz 3,96(31) + lwz 4,100(31) + rotlwi 5,3,8 + rotlwi 28,4,8 + rlwimi 5,3,24,0,7 + rlwimi 28,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 28,4,24,16,23 + insrdi 28,5,32,0 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + ld 0,104(7) + add 11,11,3 + add 11,11,5 + + lwz 3,104(31) + lwz 4,108(31) + rotlwi 5,3,8 + rotlwi 29,4,8 + rlwimi 5,3,24,0,7 + rlwimi 29,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 29,4,24,16,23 + insrdi 29,5,32,0 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + ld 0,112(7) + add 10,10,3 + add 10,10,5 + + lwz 3,112(31) + lwz 4,116(31) + rotlwi 5,3,8 + rotlwi 30,4,8 + rlwimi 5,3,24,0,7 + rlwimi 30,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 30,4,24,16,23 + insrdi 30,5,32,0 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + ld 0,120(7) + add 9,9,3 + add 9,9,5 + + lwz 3,120(31) + lwz 4,124(31) + rotlwi 5,3,8 + rotlwi 31,4,8 + rlwimi 5,3,24,0,7 + rlwimi 31,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 31,4,24,16,23 + insrdi 31,5,32,0 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + li 5,4 + mtctr 5 +.align 4 +.Lrounds: + addi 7,7,128 + rotrdi 3,17,1 + rotrdi 4,17,8 + rotrdi 5,30,19 + rotrdi 0,30,61 + xor 3,3,4 + srdi 4,17,7 + xor 5,5,0 + srdi 0,30,6 + add 16,16,25 + xor 3,3,4 + xor 5,5,0 + ld 0,0(7) + add 16,16,3 + add 16,16,5 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrdi 3,18,1 + rotrdi 4,18,8 + rotrdi 5,31,19 + rotrdi 0,31,61 + xor 3,3,4 + srdi 4,18,7 + xor 5,5,0 + srdi 0,31,6 + add 17,17,26 + xor 3,3,4 + xor 5,5,0 + ld 0,8(7) + add 17,17,3 + add 17,17,5 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 
3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrdi 3,19,1 + rotrdi 4,19,8 + rotrdi 5,16,19 + rotrdi 0,16,61 + xor 3,3,4 + srdi 4,19,7 + xor 5,5,0 + srdi 0,16,6 + add 18,18,27 + xor 3,3,4 + xor 5,5,0 + ld 0,16(7) + add 18,18,3 + add 18,18,5 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrdi 3,20,1 + rotrdi 4,20,8 + rotrdi 5,17,19 + rotrdi 0,17,61 + xor 3,3,4 + srdi 4,20,7 + xor 5,5,0 + srdi 0,17,6 + add 19,19,28 + xor 3,3,4 + xor 5,5,0 + ld 0,24(7) + add 19,19,3 + add 19,19,5 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrdi 3,21,1 + rotrdi 4,21,8 + rotrdi 5,18,19 + rotrdi 0,18,61 + xor 3,3,4 + srdi 4,21,7 + xor 5,5,0 + srdi 0,18,6 + add 20,20,29 + xor 3,3,4 + xor 5,5,0 + ld 0,32(7) + add 20,20,3 + add 20,20,5 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrdi 3,22,1 + rotrdi 4,22,8 + rotrdi 5,19,19 + rotrdi 0,19,61 + xor 3,3,4 + srdi 4,22,7 + xor 5,5,0 + srdi 0,19,6 + add 21,21,30 + xor 3,3,4 + xor 5,5,0 + ld 0,40(7) + add 21,21,3 + add 21,21,5 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrdi 3,23,1 + rotrdi 4,23,8 + rotrdi 5,20,19 + rotrdi 0,20,61 + xor 3,3,4 + srdi 4,23,7 + xor 5,5,0 + srdi 0,20,6 + add 22,22,31 + xor 3,3,4 + xor 5,5,0 + ld 0,48(7) + add 22,22,3 + add 22,22,5 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrdi 3,24,1 + rotrdi 4,24,8 + rotrdi 5,21,19 + rotrdi 0,21,61 + xor 3,3,4 + srdi 4,24,7 + xor 5,5,0 + srdi 0,21,6 + add 23,23,16 + xor 3,3,4 + xor 5,5,0 + ld 0,56(7) + add 23,23,3 + add 23,23,5 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + rotrdi 3,25,1 + rotrdi 4,25,8 + rotrdi 5,22,19 + rotrdi 0,22,61 + xor 3,3,4 + srdi 4,25,7 + xor 5,5,0 + srdi 0,22,6 + add 24,24,17 + xor 3,3,4 + xor 5,5,0 + ld 0,64(7) + add 24,24,3 + add 24,24,5 + rotrdi 3,12,14 + rotrdi 4,12,18 + 
and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrdi 3,26,1 + rotrdi 4,26,8 + rotrdi 5,23,19 + rotrdi 0,23,61 + xor 3,3,4 + srdi 4,26,7 + xor 5,5,0 + srdi 0,23,6 + add 25,25,18 + xor 3,3,4 + xor 5,5,0 + ld 0,72(7) + add 25,25,3 + add 25,25,5 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrdi 3,27,1 + rotrdi 4,27,8 + rotrdi 5,24,19 + rotrdi 0,24,61 + xor 3,3,4 + srdi 4,27,7 + xor 5,5,0 + srdi 0,24,6 + add 26,26,19 + xor 3,3,4 + xor 5,5,0 + ld 0,80(7) + add 26,26,3 + add 26,26,5 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrdi 3,28,1 + rotrdi 4,28,8 + rotrdi 5,25,19 + rotrdi 0,25,61 + xor 3,3,4 + srdi 4,28,7 + xor 5,5,0 + srdi 0,25,6 + add 27,27,20 + xor 3,3,4 + xor 5,5,0 + ld 0,88(7) + add 27,27,3 + add 27,27,5 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrdi 3,29,1 + rotrdi 4,29,8 + rotrdi 5,26,19 + rotrdi 0,26,61 + xor 3,3,4 + srdi 4,29,7 + xor 5,5,0 + srdi 0,26,6 + add 28,28,21 + xor 3,3,4 + xor 5,5,0 + ld 0,96(7) + add 28,28,3 + add 28,28,5 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrdi 3,30,1 + rotrdi 4,30,8 + rotrdi 5,27,19 + rotrdi 0,27,61 + xor 3,3,4 + srdi 4,30,7 + xor 5,5,0 + srdi 0,27,6 + add 29,29,22 + xor 3,3,4 + xor 5,5,0 + ld 0,104(7) + add 29,29,3 + add 29,29,5 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrdi 3,31,1 + rotrdi 4,31,8 + rotrdi 5,28,19 + rotrdi 0,28,61 + xor 3,3,4 + srdi 4,31,7 + xor 5,5,0 + srdi 0,28,6 + add 30,30,23 + xor 3,3,4 + xor 5,5,0 + ld 0,112(7) + add 30,30,3 + add 30,30,5 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 
5,5,0 + add 9,9,3 + add 9,9,5 + + rotrdi 3,16,1 + rotrdi 4,16,8 + rotrdi 5,29,19 + rotrdi 0,29,61 + xor 3,3,4 + srdi 4,16,7 + xor 5,5,0 + srdi 0,29,6 + add 31,31,24 + xor 3,3,4 + xor 5,5,0 + ld 0,120(7) + add 31,31,3 + add 31,31,5 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + bdnz .Lrounds + + ld 3,208(1) + ld 31,200(1) + ld 5,192(1) + subi 7,7,512 + + ld 16,0(3) + ld 17,8(3) + ld 18,16(3) + ld 19,24(3) + ld 20,32(3) + ld 21,40(3) + ld 22,48(3) + addi 31,31,128 + ld 23,56(3) + add 8,8,16 + add 9,9,17 + std 31,200(1) + add 10,10,18 + std 8,0(3) + add 11,11,19 + std 9,8(3) + add 12,12,20 + std 10,16(3) + add 6,6,21 + std 11,24(3) + add 14,14,22 + std 12,32(3) + add 15,15,23 + std 6,40(3) + std 14,48(3) + cmpld 31,5 + std 15,56(3) + bne .Lsha2_block_private + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size zfs_sha512_ppc,.-zfs_sha512_ppc +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 7 + addi 7,7,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0xd728ae22,0x428a2f98 +.long 0x23ef65cd,0x71374491 +.long 0xec4d3b2f,0xb5c0fbcf +.long 0x8189dbbc,0xe9b5dba5 +.long 0xf348b538,0x3956c25b +.long 0xb605d019,0x59f111f1 +.long 0xaf194f9b,0x923f82a4 +.long 0xda6d8118,0xab1c5ed5 +.long 0xa3030242,0xd807aa98 +.long 0x45706fbe,0x12835b01 +.long 0x4ee4b28c,0x243185be +.long 0xd5ffb4e2,0x550c7dc3 +.long 0xf27b896f,0x72be5d74 +.long 0x3b1696b1,0x80deb1fe +.long 0x25c71235,0x9bdc06a7 +.long 0xcf692694,0xc19bf174 +.long 0x9ef14ad2,0xe49b69c1 +.long 0x384f25e3,0xefbe4786 +.long 0x8b8cd5b5,0x0fc19dc6 +.long 0x77ac9c65,0x240ca1cc +.long 0x592b0275,0x2de92c6f +.long 0x6ea6e483,0x4a7484aa +.long 0xbd41fbd4,0x5cb0a9dc +.long 0x831153b5,0x76f988da +.long 0xee66dfab,0x983e5152 +.long 0x2db43210,0xa831c66d +.long 0x98fb213f,0xb00327c8 +.long 0xbeef0ee4,0xbf597fc7 +.long 0x3da88fc2,0xc6e00bf3 +.long 0x930aa725,0xd5a79147 +.long 0xe003826f,0x06ca6351 +.long 0x0a0e6e70,0x14292967 +.long 0x46d22ffc,0x27b70a85 +.long 0x5c26c926,0x2e1b2138 +.long 0x5ac42aed,0x4d2c6dfc +.long 0x9d95b3df,0x53380d13 +.long 0x8baf63de,0x650a7354 +.long 0x3c77b2a8,0x766a0abb +.long 0x47edaee6,0x81c2c92e +.long 0x1482353b,0x92722c85 +.long 0x4cf10364,0xa2bfe8a1 +.long 0xbc423001,0xa81a664b +.long 0xd0f89791,0xc24b8b70 +.long 0x0654be30,0xc76c51a3 +.long 0xd6ef5218,0xd192e819 +.long 0x5565a910,0xd6990624 +.long 0x5771202a,0xf40e3585 +.long 0x32bbd1b8,0x106aa070 +.long 0xb8d2d0c8,0x19a4c116 +.long 0x5141ab53,0x1e376c08 +.long 0xdf8eeb99,0x2748774c +.long 0xe19b48a8,0x34b0bcb5 +.long 0xc5c95a63,0x391c0cb3 +.long 0xe3418acb,0x4ed8aa4a +.long 0x7763e373,0x5b9cca4f +.long 0xd6b2b8a3,0x682e6ff3 +.long 0x5defb2fc,0x748f82ee +.long 0x43172f60,0x78a5636f +.long 0xa1f0ab72,0x84c87814 +.long 0x1a6439ec,0x8cc70208 +.long 0x23631e28,0x90befffa +.long 0xde82bde9,0xa4506ceb +.long 0xb2c67915,0xbef9a3f7 +.long 0xe372532b,0xc67178f2 +.long 0xea26619c,0xca273ece +.long 0x21c0c207,0xd186b8c7 +.long 0xcde0eb1e,0xeada7dd6 +.long 0xee6ed178,0xf57d4f7f +.long 0x72176fba,0x06f067aa +.long 0xa2c898a6,0x0a637dc5 +.long 0xbef90dae,0x113f9804 +.long 0x131c471b,0x1b710b35 +.long 0x23047d84,0x28db77f5 +.long 0x40c72493,0x32caab7b +.long 0x15c9bebc,0x3c9ebe0a +.long 0x9c100d4c,0x431d67c4 +.long 0xcb3e42b6,0x4cc5d4be +.long 0xfc657e2a,0x597f299c +.long 
0x3ad6faec,0x5fcb6fab +.long 0x4a475817,0x6c44198c + +#endif diff --git a/module/icp/asm-x86_64/sha2/sha256-x86_64.S b/module/icp/asm-x86_64/sha2/sha256-x86_64.S new file mode 100644 index 000000000000..da3722f808ba --- /dev/null +++ b/module/icp/asm-x86_64/sha2/sha256-x86_64.S @@ -0,0 +1,5104 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if defined(__x86_64) + +#define _ASM +#include + +SECTION_STATIC + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + +ENTRY_ALIGN(zfs_sha256_transform_x64, 16) +.cfi_startproc + ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq 
%rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue: + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + rorl $11,%r14d + xorl %r11d,%r13d + 
addl %edi,%r12d + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 20(%rbp),%rbp + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + xorl %r10d,%edi + 
rorl $6,%r13d + movl %r10d,%r8d + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 20(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d 
+ xorl %r14d,%edi + shrl $10,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + addl 20(%rsp),%r12d + 
movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + xorl %edx,%r13d + rorl $9,%r14d + xorl 
%r9d,%edi + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + rorl 
$11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + cmpq 64+16(%rsp),%rsi + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + RET +.cfi_endproc +SET_SIZE(zfs_sha256_transform_x64) + +ENTRY_ALIGN(zfs_sha256_transform_shani, 64) +.cfi_startproc + ENDBR + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shani + +.align 16 +.Loop_shani: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + 
movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shani + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + RET +.cfi_endproc 
+SET_SIZE(zfs_sha256_transform_shani) + +ENTRY_ALIGN(zfs_sha256_transform_ssse3, 64) +.cfi_startproc + ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 
+ rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d 
+ rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl 
%r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl 
%r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d 
+ rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_ssse3: + RET +.cfi_endproc +SET_SIZE(zfs_sha256_transform_ssse3) + +ENTRY_ALIGN(zfs_sha256_transform_avx, 64) +.cfi_startproc + ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 
+ vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl 
%eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld 
$11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl 
$9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + 
xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + 
shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + RET +.cfi_endproc +SET_SIZE(zfs_sha256_transform_avx) + +ENTRY_ALIGN(zfs_sha256_transform_avx2, 64) +.cfi_startproc + ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $544,%rsp + shlq $4,%rdx + andq $-1024,%rsp + leaq (%rsi,%rdx,4),%rdx + addq $448,%rsp + movq 
%rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-64,%rsi + movl 0(%rdi),%eax + movq %rsi,%r12 + movl 4(%rdi),%ebx + cmpq %rdx,%rsi + movl 8(%rdi),%ecx + cmoveq %rsp,%r12 + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%ymm8 + vmovdqa K256+512+64(%rip),%ymm9 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa K256+512(%rip),%ymm7 + vmovdqu -64+0(%rsi),%xmm0 + vmovdqu -64+16(%rsi),%xmm1 + vmovdqu -64+32(%rsi),%xmm2 + vmovdqu -64+48(%rsi),%xmm3 + + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm7,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm7,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + + leaq K256(%rip),%rbp + vpshufb %ymm7,%ymm2,%ymm2 + vpaddd 0(%rbp),%ymm0,%ymm4 + vpshufb %ymm7,%ymm3,%ymm3 + vpaddd 32(%rbp),%ymm1,%ymm5 + vpaddd 64(%rbp),%ymm2,%ymm6 + vpaddd 96(%rbp),%ymm3,%ymm7 + vmovdqa %ymm4,0(%rsp) + xorl %r14d,%r14d + vmovdqa %ymm5,32(%rsp) + + movq 88(%rsp),%rdi +.cfi_def_cfa %rdi,8 + leaq -64(%rsp),%rsp + + + + movq %rdi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + movl %ebx,%edi + vmovdqa %ymm6,0(%rsp) + xorl %ecx,%edi + vmovdqa %ymm7,32(%rsp) + movl %r9d,%r12d + subq $-32*4,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm0,%ymm1,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm2,%ymm3,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm0,%ymm0 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm3,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm0,%ymm0 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm0,%ymm0 + rorxl 
$22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm0,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 0(%rbp),%ymm0,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm1,%ymm2,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm3,%ymm0,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm1,%ymm1 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm0,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm1,%ymm1 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm1,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal 
(%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 32(%rbp),%ymm1,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm2,%ymm3,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm0,%ymm1,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm2,%ymm2 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm1,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm2,%ymm2 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm2,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 64(%rbp),%ymm2,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm3,%ymm0,%ymm4 + addl 
32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm1,%ymm2,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm3,%ymm3 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm2,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm3,%ymm3 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm3,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 96(%rbp),%ymm3,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq 128(%rbp),%rbp + cmpb $0,3(%rbp) + jne .Lavx2_00_47 + addl 0+64(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+64(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + 
leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+64(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+64(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+64(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+64(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+64(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+64(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal 
(%rax,%r15,1),%eax + movl %r9d,%r12d + addl 0(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal 
(%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rbp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + cmpq 80(%rbp),%rsi + je .Ldone_avx2 + + xorl %r14d,%r14d + movl %ebx,%edi + xorl %ecx,%edi + movl %r9d,%r12d + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl 
%r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + leaq -64(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rsp + +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + leaq 128(%rsi),%rsi + addl 24(%rdi),%r10d + movq %rsi,%r12 + addl 28(%rdi),%r11d + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + cmoveq %rsp,%r12 + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + + +.cfi_escape 0x0f,0x06,0x76,0xd8,0x00,0x06,0x23,0x08 + +.Ldone_avx2: + movq 88(%rbp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + RET +.cfi_endproc +SET_SIZE(zfs_sha256_transform_avx2) + +#if defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif +#endif diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S deleted file mode 100644 index f1fde51c1d69..000000000000 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ /dev/null @@ -1,2090 +0,0 @@ -/* - * ==================================================================== - * Written by Andy Polyakov for the OpenSSL - * project. Rights for redistribution and usage in source and binary - * forms are granted according to the OpenSSL license. 
- * ==================================================================== - * - * sha256/512_block procedure for x86_64. - * - * 40% improvement over compiler-generated code on Opteron. On EM64T - * sha256 was observed to run >80% faster and sha512 - >40%. No magical - * tricks, just straight implementation... I really wonder why gcc - * [being armed with inline assembler] fails to generate as fast code. - * The only thing which is cool about this module is that it's very - * same instruction sequence used for both SHA-256 and SHA-512. In - * former case the instructions operate on 32-bit operands, while in - * latter - on 64-bit ones. All I had to do is to get one flavor right, - * the other one passed the test right away:-) - * - * sha256_block runs in ~1005 cycles on Opteron, which gives you - * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock - * frequency in GHz. sha512_block runs in ~1275 cycles, which results - * in 128*1000/1275=100MBps per GHz. Is there room for improvement? - * Well, if you compare it to IA-64 implementation, which maintains - * X[16] in register bank[!], tends to 4 instructions per CPU clock - * cycle and runs in 1003 cycles, 1275 is very good result for 3-way - * issue Opteron pipeline and X[16] maintained in memory. So that *if* - * there is a way to improve it, *then* the only way would be to try to - * offload X[16] updates to SSE unit, but that would require "deeper" - * loop unroll, which in turn would naturally cause size blow-up, not - * to mention increased complexity! And once again, only *if* it's - * actually possible to noticeably improve overall ILP, instruction - * level parallelism, on a given CPU implementation in this case. - * - * Special note on Intel EM64T. While Opteron CPU exhibits perfect - * performance ratio of 1.5 between 64- and 32-bit flavors [see above], - * [currently available] EM64T CPUs apparently are far from it. On the - * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit - * sha256_block:-( This is presumably because 64-bit shifts/rotates - * apparently are not atomic instructions, but implemented in microcode. - */ - -/* - * OpenSolaris OS modifications - * - * Sun elects to use this software under the BSD license. - * - * This source originates from OpenSSL file sha512-x86_64.pl at - * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz - * (presumably for future OpenSSL release 0.9.8h), with these changes: - * - * 1. Added perl "use strict" and declared variables. - * - * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from - * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. - * - * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) - * assemblers). Replaced the .picmeup macro with assembler code. - * - * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", - * at the beginning of SHA2_CTX (the next field is 8-byte aligned). - */ - -/* - * This file was generated by a perl script (sha512-x86_64.pl) that were - * used to generate sha256 and sha512 variants from the same code base. - * The comments from the original file have been pasted above. 
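For reference only (this sketch is not part of the diff): the scalar rounds removed below and the new SSSE3/AVX/AVX2 routines added above all compute the standard FIPS 180-4 SHA-256 round. A minimal C sketch of one round, assuming a 16-word circular schedule buffer X[] and the usual K256 constants, and using the same Ch/Maj formulations spelled out in the inline assembly comments ("((f^g)&e)^g" and "((a|c)&b)|(a&c)"):

#include <stdint.h>

#define ROTR32(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))

#define CH(e, f, g)	((((f) ^ (g)) & (e)) ^ (g))		/* == (e&f)^(~e&g) */
#define MAJ(a, b, c)	((((a) | (c)) & (b)) | ((a) & (c)))	/* == (a&b)^(a&c)^(b&c) */
#define SIGMA0(a)	(ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22))
#define SIGMA1(e)	(ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25))
#define sigma0(x)	(ROTR32(x, 7) ^ ROTR32(x, 18) ^ ((x) >> 3))
#define sigma1(x)	(ROTR32(x, 17) ^ ROTR32(x, 19) ^ ((x) >> 10))

/*
 * One SHA-256 round.  S[] holds the working state a..h, X[] the 16-word
 * circular message schedule, K256[] the round constants; hypothetical
 * names chosen for this sketch only.
 */
static inline void
sha256_round(uint32_t S[8], uint32_t X[16], const uint32_t K256[64], int i)
{
	uint32_t a = S[0], b = S[1], c = S[2], d = S[3];
	uint32_t e = S[4], f = S[5], g = S[6], h = S[7];

	if (i >= 16)	/* schedule expansion, as in .Lrounds_16_xx */
		X[i & 0xf] += sigma0(X[(i + 1) & 0xf]) +
		    sigma1(X[(i + 14) & 0xf]) + X[(i + 9) & 0xf];

	uint32_t T1 = h + SIGMA1(e) + CH(e, f, g) + K256[i] + X[i & 0xf];
	uint32_t T2 = SIGMA0(a) + MAJ(a, b, c);

	S[7] = g; S[6] = f; S[5] = e; S[4] = d + T1;	/* d += T1 */
	S[3] = c; S[2] = b; S[1] = a; S[0] = T1 + T2;	/* h = T1 + Sigma0 + Maj */
}
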
- */ - -#if defined(lint) || defined(__lint) -#include -#include - -void -SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) -{ - (void) ctx, (void) in, (void) num; -} - - -#else -#define _ASM -#include - -ENTRY_NP(SHA256TransformBlocks) -.cfi_startproc - ENDBR - movq %rsp, %rax -.cfi_def_cfa_register %rax - push %rbx -.cfi_offset %rbx,-16 - push %rbp -.cfi_offset %rbp,-24 - push %r12 -.cfi_offset %r12,-32 - push %r13 -.cfi_offset %r13,-40 - push %r14 -.cfi_offset %r14,-48 - push %r15 -.cfi_offset %r15,-56 - mov %rsp,%rbp # copy %rsp - shl $4,%rdx # num*16 - sub $16*4+4*8,%rsp - lea (%rsi,%rdx,4),%rdx # inp+num*16*4 - and $-64,%rsp # align stack frame - add $8,%rdi # Skip OpenSolaris field, "algotype" - mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg - mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg - mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg - mov %rbp,16*4+3*8(%rsp) # save copy of %rsp -# echo ".cfi_cfa_expression %rsp+88,deref,+56" | -# openssl/crypto/perlasm/x86_64-xlate.pl -.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x38 - - #.picmeup %rbp - # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts - # the address of the "next" instruction into the target register - # (%rbp). This generates these 2 instructions: - lea .Llea(%rip),%rbp - #nop # .picmeup generates a nop for mod 8 alignment--not needed here - -.Llea: - lea K256-.(%rbp),%rbp - - mov 4*0(%rdi),%eax - mov 4*1(%rdi),%ebx - mov 4*2(%rdi),%ecx - mov 4*3(%rdi),%edx - mov 4*4(%rdi),%r8d - mov 4*5(%rdi),%r9d - mov 4*6(%rdi),%r10d - mov 4*7(%rdi),%r11d - jmp .Lloop - -.balign 16 -.Lloop: - xor %rdi,%rdi - mov 4*0(%rsi),%r12d - bswap %r12d - mov %r8d,%r13d - mov %r8d,%r14d - mov %r9d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r10d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r8d,%r15d # (f^g)&e - mov %r12d,0(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r11d,%r12d # T1+=h - - mov %eax,%r11d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %eax,%r13d - mov %eax,%r14d - - ror $2,%r11d - ror $13,%r13d - mov %eax,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r11d - ror $9,%r13d - or %ecx,%r14d # a|c - - xor %r13d,%r11d # h=Sigma0(a) - and %ecx,%r15d # a&c - add %r12d,%edx # d+=T1 - - and %ebx,%r14d # (a|c)&b - add %r12d,%r11d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r11d # h+=Maj(a,b,c) - mov 4*1(%rsi),%r12d - bswap %r12d - mov %edx,%r13d - mov %edx,%r14d - mov %r8d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r9d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %edx,%r15d # (f^g)&e - mov %r12d,4(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r10d,%r12d # T1+=h - - mov %r11d,%r10d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r11d,%r13d - mov %r11d,%r14d - - ror $2,%r10d - ror $13,%r13d - mov %r11d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r10d - ror $9,%r13d - or %ebx,%r14d # a|c - - xor %r13d,%r10d # h=Sigma0(a) - and %ebx,%r15d # a&c - add %r12d,%ecx # d+=T1 - - and %eax,%r14d # (a|c)&b - add %r12d,%r10d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r10d # h+=Maj(a,b,c) - mov 4*2(%rsi),%r12d - bswap %r12d - mov %ecx,%r13d - mov %ecx,%r14d - mov %edx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r8d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ecx,%r15d # (f^g)&e - mov %r12d,8(%rsp) - - xor 
%r14d,%r13d # Sigma1(e) - xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r9d,%r12d # T1+=h - - mov %r10d,%r9d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r10d,%r13d - mov %r10d,%r14d - - ror $2,%r9d - ror $13,%r13d - mov %r10d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r9d - ror $9,%r13d - or %eax,%r14d # a|c - - xor %r13d,%r9d # h=Sigma0(a) - and %eax,%r15d # a&c - add %r12d,%ebx # d+=T1 - - and %r11d,%r14d # (a|c)&b - add %r12d,%r9d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r9d # h+=Maj(a,b,c) - mov 4*3(%rsi),%r12d - bswap %r12d - mov %ebx,%r13d - mov %ebx,%r14d - mov %ecx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %edx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ebx,%r15d # (f^g)&e - mov %r12d,12(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r8d,%r12d # T1+=h - - mov %r9d,%r8d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r9d,%r13d - mov %r9d,%r14d - - ror $2,%r8d - ror $13,%r13d - mov %r9d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r8d - ror $9,%r13d - or %r11d,%r14d # a|c - - xor %r13d,%r8d # h=Sigma0(a) - and %r11d,%r15d # a&c - add %r12d,%eax # d+=T1 - - and %r10d,%r14d # (a|c)&b - add %r12d,%r8d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r8d # h+=Maj(a,b,c) - mov 4*4(%rsi),%r12d - bswap %r12d - mov %eax,%r13d - mov %eax,%r14d - mov %ebx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ecx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %eax,%r15d # (f^g)&e - mov %r12d,16(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %edx,%r12d # T1+=h - - mov %r8d,%edx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r8d,%r13d - mov %r8d,%r14d - - ror $2,%edx - ror $13,%r13d - mov %r8d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%edx - ror $9,%r13d - or %r10d,%r14d # a|c - - xor %r13d,%edx # h=Sigma0(a) - and %r10d,%r15d # a&c - add %r12d,%r11d # d+=T1 - - and %r9d,%r14d # (a|c)&b - add %r12d,%edx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%edx # h+=Maj(a,b,c) - mov 4*5(%rsi),%r12d - bswap %r12d - mov %r11d,%r13d - mov %r11d,%r14d - mov %eax,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ebx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r11d,%r15d # (f^g)&e - mov %r12d,20(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ecx,%r12d # T1+=h - - mov %edx,%ecx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %edx,%r13d - mov %edx,%r14d - - ror $2,%ecx - ror $13,%r13d - mov %edx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ecx - ror $9,%r13d - or %r9d,%r14d # a|c - - xor %r13d,%ecx # h=Sigma0(a) - and %r9d,%r15d # a&c - add %r12d,%r10d # d+=T1 - - and %r8d,%r14d # (a|c)&b - add %r12d,%ecx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ecx # h+=Maj(a,b,c) - mov 4*6(%rsi),%r12d - bswap %r12d - mov %r10d,%r13d - mov %r10d,%r14d - mov %r11d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %eax,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r10d,%r15d # (f^g)&e - mov %r12d,24(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ebx,%r12d # T1+=h - - mov %ecx,%ebx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # 
T1+=Ch(e,f,g) - mov %ecx,%r13d - mov %ecx,%r14d - - ror $2,%ebx - ror $13,%r13d - mov %ecx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ebx - ror $9,%r13d - or %r8d,%r14d # a|c - - xor %r13d,%ebx # h=Sigma0(a) - and %r8d,%r15d # a&c - add %r12d,%r9d # d+=T1 - - and %edx,%r14d # (a|c)&b - add %r12d,%ebx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ebx # h+=Maj(a,b,c) - mov 4*7(%rsi),%r12d - bswap %r12d - mov %r9d,%r13d - mov %r9d,%r14d - mov %r10d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r11d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r9d,%r15d # (f^g)&e - mov %r12d,28(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %eax,%r12d # T1+=h - - mov %ebx,%eax - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ebx,%r13d - mov %ebx,%r14d - - ror $2,%eax - ror $13,%r13d - mov %ebx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%eax - ror $9,%r13d - or %edx,%r14d # a|c - - xor %r13d,%eax # h=Sigma0(a) - and %edx,%r15d # a&c - add %r12d,%r8d # d+=T1 - - and %ecx,%r14d # (a|c)&b - add %r12d,%eax # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%eax # h+=Maj(a,b,c) - mov 4*8(%rsi),%r12d - bswap %r12d - mov %r8d,%r13d - mov %r8d,%r14d - mov %r9d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r10d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r8d,%r15d # (f^g)&e - mov %r12d,32(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r11d,%r12d # T1+=h - - mov %eax,%r11d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %eax,%r13d - mov %eax,%r14d - - ror $2,%r11d - ror $13,%r13d - mov %eax,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r11d - ror $9,%r13d - or %ecx,%r14d # a|c - - xor %r13d,%r11d # h=Sigma0(a) - and %ecx,%r15d # a&c - add %r12d,%edx # d+=T1 - - and %ebx,%r14d # (a|c)&b - add %r12d,%r11d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r11d # h+=Maj(a,b,c) - mov 4*9(%rsi),%r12d - bswap %r12d - mov %edx,%r13d - mov %edx,%r14d - mov %r8d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r9d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %edx,%r15d # (f^g)&e - mov %r12d,36(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r10d,%r12d # T1+=h - - mov %r11d,%r10d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r11d,%r13d - mov %r11d,%r14d - - ror $2,%r10d - ror $13,%r13d - mov %r11d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r10d - ror $9,%r13d - or %ebx,%r14d # a|c - - xor %r13d,%r10d # h=Sigma0(a) - and %ebx,%r15d # a&c - add %r12d,%ecx # d+=T1 - - and %eax,%r14d # (a|c)&b - add %r12d,%r10d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r10d # h+=Maj(a,b,c) - mov 4*10(%rsi),%r12d - bswap %r12d - mov %ecx,%r13d - mov %ecx,%r14d - mov %edx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r8d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ecx,%r15d # (f^g)&e - mov %r12d,40(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r9d,%r12d # T1+=h - - mov %r10d,%r9d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r10d,%r13d - mov %r10d,%r14d - - ror $2,%r9d - ror $13,%r13d - mov %r10d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r9d - ror 
$9,%r13d - or %eax,%r14d # a|c - - xor %r13d,%r9d # h=Sigma0(a) - and %eax,%r15d # a&c - add %r12d,%ebx # d+=T1 - - and %r11d,%r14d # (a|c)&b - add %r12d,%r9d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r9d # h+=Maj(a,b,c) - mov 4*11(%rsi),%r12d - bswap %r12d - mov %ebx,%r13d - mov %ebx,%r14d - mov %ecx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %edx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ebx,%r15d # (f^g)&e - mov %r12d,44(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r8d,%r12d # T1+=h - - mov %r9d,%r8d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r9d,%r13d - mov %r9d,%r14d - - ror $2,%r8d - ror $13,%r13d - mov %r9d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r8d - ror $9,%r13d - or %r11d,%r14d # a|c - - xor %r13d,%r8d # h=Sigma0(a) - and %r11d,%r15d # a&c - add %r12d,%eax # d+=T1 - - and %r10d,%r14d # (a|c)&b - add %r12d,%r8d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r8d # h+=Maj(a,b,c) - mov 4*12(%rsi),%r12d - bswap %r12d - mov %eax,%r13d - mov %eax,%r14d - mov %ebx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ecx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %eax,%r15d # (f^g)&e - mov %r12d,48(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %edx,%r12d # T1+=h - - mov %r8d,%edx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r8d,%r13d - mov %r8d,%r14d - - ror $2,%edx - ror $13,%r13d - mov %r8d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%edx - ror $9,%r13d - or %r10d,%r14d # a|c - - xor %r13d,%edx # h=Sigma0(a) - and %r10d,%r15d # a&c - add %r12d,%r11d # d+=T1 - - and %r9d,%r14d # (a|c)&b - add %r12d,%edx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%edx # h+=Maj(a,b,c) - mov 4*13(%rsi),%r12d - bswap %r12d - mov %r11d,%r13d - mov %r11d,%r14d - mov %eax,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ebx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r11d,%r15d # (f^g)&e - mov %r12d,52(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ecx,%r12d # T1+=h - - mov %edx,%ecx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %edx,%r13d - mov %edx,%r14d - - ror $2,%ecx - ror $13,%r13d - mov %edx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ecx - ror $9,%r13d - or %r9d,%r14d # a|c - - xor %r13d,%ecx # h=Sigma0(a) - and %r9d,%r15d # a&c - add %r12d,%r10d # d+=T1 - - and %r8d,%r14d # (a|c)&b - add %r12d,%ecx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ecx # h+=Maj(a,b,c) - mov 4*14(%rsi),%r12d - bswap %r12d - mov %r10d,%r13d - mov %r10d,%r14d - mov %r11d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %eax,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r10d,%r15d # (f^g)&e - mov %r12d,56(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ebx,%r12d # T1+=h - - mov %ecx,%ebx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ecx,%r13d - mov %ecx,%r14d - - ror $2,%ebx - ror $13,%r13d - mov %ecx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ebx - ror $9,%r13d - or %r8d,%r14d # a|c - - xor %r13d,%ebx # h=Sigma0(a) - and %r8d,%r15d # a&c - add %r12d,%r9d # d+=T1 - - and %edx,%r14d # (a|c)&b - add %r12d,%ebx # h+=T1 - - or 
%r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ebx # h+=Maj(a,b,c) - mov 4*15(%rsi),%r12d - bswap %r12d - mov %r9d,%r13d - mov %r9d,%r14d - mov %r10d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r11d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r9d,%r15d # (f^g)&e - mov %r12d,60(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %eax,%r12d # T1+=h - - mov %ebx,%eax - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ebx,%r13d - mov %ebx,%r14d - - ror $2,%eax - ror $13,%r13d - mov %ebx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%eax - ror $9,%r13d - or %edx,%r14d # a|c - - xor %r13d,%eax # h=Sigma0(a) - and %edx,%r15d # a&c - add %r12d,%r8d # d+=T1 - - and %ecx,%r14d # (a|c)&b - add %r12d,%eax # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%eax # h+=Maj(a,b,c) - jmp .Lrounds_16_xx -.balign 16 -.Lrounds_16_xx: - mov 4(%rsp),%r13d - mov 56(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 36(%rsp),%r12d - - add 0(%rsp),%r12d - mov %r8d,%r13d - mov %r8d,%r14d - mov %r9d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r10d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r8d,%r15d # (f^g)&e - mov %r12d,0(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r11d,%r12d # T1+=h - - mov %eax,%r11d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %eax,%r13d - mov %eax,%r14d - - ror $2,%r11d - ror $13,%r13d - mov %eax,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r11d - ror $9,%r13d - or %ecx,%r14d # a|c - - xor %r13d,%r11d # h=Sigma0(a) - and %ecx,%r15d # a&c - add %r12d,%edx # d+=T1 - - and %ebx,%r14d # (a|c)&b - add %r12d,%r11d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r11d # h+=Maj(a,b,c) - mov 8(%rsp),%r13d - mov 60(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 40(%rsp),%r12d - - add 4(%rsp),%r12d - mov %edx,%r13d - mov %edx,%r14d - mov %r8d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r9d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %edx,%r15d # (f^g)&e - mov %r12d,4(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r10d,%r12d # T1+=h - - mov %r11d,%r10d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r11d,%r13d - mov %r11d,%r14d - - ror $2,%r10d - ror $13,%r13d - mov %r11d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r10d - ror $9,%r13d - or %ebx,%r14d # a|c - - xor %r13d,%r10d # h=Sigma0(a) - and %ebx,%r15d # a&c - add %r12d,%ecx # d+=T1 - - and %eax,%r14d # (a|c)&b - add %r12d,%r10d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r10d # h+=Maj(a,b,c) - mov 12(%rsp),%r13d - mov 0(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - 
shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 44(%rsp),%r12d - - add 8(%rsp),%r12d - mov %ecx,%r13d - mov %ecx,%r14d - mov %edx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r8d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ecx,%r15d # (f^g)&e - mov %r12d,8(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r9d,%r12d # T1+=h - - mov %r10d,%r9d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r10d,%r13d - mov %r10d,%r14d - - ror $2,%r9d - ror $13,%r13d - mov %r10d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r9d - ror $9,%r13d - or %eax,%r14d # a|c - - xor %r13d,%r9d # h=Sigma0(a) - and %eax,%r15d # a&c - add %r12d,%ebx # d+=T1 - - and %r11d,%r14d # (a|c)&b - add %r12d,%r9d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r9d # h+=Maj(a,b,c) - mov 16(%rsp),%r13d - mov 4(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 48(%rsp),%r12d - - add 12(%rsp),%r12d - mov %ebx,%r13d - mov %ebx,%r14d - mov %ecx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %edx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ebx,%r15d # (f^g)&e - mov %r12d,12(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r8d,%r12d # T1+=h - - mov %r9d,%r8d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r9d,%r13d - mov %r9d,%r14d - - ror $2,%r8d - ror $13,%r13d - mov %r9d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r8d - ror $9,%r13d - or %r11d,%r14d # a|c - - xor %r13d,%r8d # h=Sigma0(a) - and %r11d,%r15d # a&c - add %r12d,%eax # d+=T1 - - and %r10d,%r14d # (a|c)&b - add %r12d,%r8d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r8d # h+=Maj(a,b,c) - mov 20(%rsp),%r13d - mov 8(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 52(%rsp),%r12d - - add 16(%rsp),%r12d - mov %eax,%r13d - mov %eax,%r14d - mov %ebx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ecx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %eax,%r15d # (f^g)&e - mov %r12d,16(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %edx,%r12d # T1+=h - - mov %r8d,%edx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r8d,%r13d - mov %r8d,%r14d - - ror $2,%edx - ror $13,%r13d - mov %r8d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%edx - ror $9,%r13d - or %r10d,%r14d # a|c - - xor %r13d,%edx # h=Sigma0(a) - and %r10d,%r15d # a&c - add %r12d,%r11d # d+=T1 - - and %r9d,%r14d # (a|c)&b - add %r12d,%edx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%edx # h+=Maj(a,b,c) - mov 24(%rsp),%r13d - mov 12(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror 
$17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 56(%rsp),%r12d - - add 20(%rsp),%r12d - mov %r11d,%r13d - mov %r11d,%r14d - mov %eax,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ebx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r11d,%r15d # (f^g)&e - mov %r12d,20(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ecx,%r12d # T1+=h - - mov %edx,%ecx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %edx,%r13d - mov %edx,%r14d - - ror $2,%ecx - ror $13,%r13d - mov %edx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ecx - ror $9,%r13d - or %r9d,%r14d # a|c - - xor %r13d,%ecx # h=Sigma0(a) - and %r9d,%r15d # a&c - add %r12d,%r10d # d+=T1 - - and %r8d,%r14d # (a|c)&b - add %r12d,%ecx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ecx # h+=Maj(a,b,c) - mov 28(%rsp),%r13d - mov 16(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 60(%rsp),%r12d - - add 24(%rsp),%r12d - mov %r10d,%r13d - mov %r10d,%r14d - mov %r11d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %eax,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r10d,%r15d # (f^g)&e - mov %r12d,24(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ebx,%r12d # T1+=h - - mov %ecx,%ebx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ecx,%r13d - mov %ecx,%r14d - - ror $2,%ebx - ror $13,%r13d - mov %ecx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ebx - ror $9,%r13d - or %r8d,%r14d # a|c - - xor %r13d,%ebx # h=Sigma0(a) - and %r8d,%r15d # a&c - add %r12d,%r9d # d+=T1 - - and %edx,%r14d # (a|c)&b - add %r12d,%ebx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ebx # h+=Maj(a,b,c) - mov 32(%rsp),%r13d - mov 20(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 0(%rsp),%r12d - - add 28(%rsp),%r12d - mov %r9d,%r13d - mov %r9d,%r14d - mov %r10d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r11d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r9d,%r15d # (f^g)&e - mov %r12d,28(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %eax,%r12d # T1+=h - - mov %ebx,%eax - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ebx,%r13d - mov %ebx,%r14d - - ror $2,%eax - ror $13,%r13d - mov %ebx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%eax - ror $9,%r13d - or %edx,%r14d # a|c - - xor %r13d,%eax # h=Sigma0(a) - and %edx,%r15d # a&c - add %r12d,%r8d # d+=T1 - - and %ecx,%r14d # (a|c)&b - add %r12d,%eax # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%eax # h+=Maj(a,b,c) - mov 36(%rsp),%r13d - mov 24(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor 
%r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 4(%rsp),%r12d - - add 32(%rsp),%r12d - mov %r8d,%r13d - mov %r8d,%r14d - mov %r9d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r10d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r8d,%r15d # (f^g)&e - mov %r12d,32(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r11d,%r12d # T1+=h - - mov %eax,%r11d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %eax,%r13d - mov %eax,%r14d - - ror $2,%r11d - ror $13,%r13d - mov %eax,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r11d - ror $9,%r13d - or %ecx,%r14d # a|c - - xor %r13d,%r11d # h=Sigma0(a) - and %ecx,%r15d # a&c - add %r12d,%edx # d+=T1 - - and %ebx,%r14d # (a|c)&b - add %r12d,%r11d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r11d # h+=Maj(a,b,c) - mov 40(%rsp),%r13d - mov 28(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 8(%rsp),%r12d - - add 36(%rsp),%r12d - mov %edx,%r13d - mov %edx,%r14d - mov %r8d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r9d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %edx,%r15d # (f^g)&e - mov %r12d,36(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r10d,%r12d # T1+=h - - mov %r11d,%r10d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r11d,%r13d - mov %r11d,%r14d - - ror $2,%r10d - ror $13,%r13d - mov %r11d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r10d - ror $9,%r13d - or %ebx,%r14d # a|c - - xor %r13d,%r10d # h=Sigma0(a) - and %ebx,%r15d # a&c - add %r12d,%ecx # d+=T1 - - and %eax,%r14d # (a|c)&b - add %r12d,%r10d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r10d # h+=Maj(a,b,c) - mov 44(%rsp),%r13d - mov 32(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 12(%rsp),%r12d - - add 40(%rsp),%r12d - mov %ecx,%r13d - mov %ecx,%r14d - mov %edx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r8d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ecx,%r15d # (f^g)&e - mov %r12d,40(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r9d,%r12d # T1+=h - - mov %r10d,%r9d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r10d,%r13d - mov %r10d,%r14d - - ror $2,%r9d - ror $13,%r13d - mov %r10d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r9d - ror $9,%r13d - or %eax,%r14d # a|c - - xor %r13d,%r9d # h=Sigma0(a) - and %eax,%r15d # a&c - add %r12d,%ebx # d+=T1 - - and %r11d,%r14d # (a|c)&b - add %r12d,%r9d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r9d # h+=Maj(a,b,c) - mov 48(%rsp),%r13d - mov 36(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor 
%r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 16(%rsp),%r12d - - add 44(%rsp),%r12d - mov %ebx,%r13d - mov %ebx,%r14d - mov %ecx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %edx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ebx,%r15d # (f^g)&e - mov %r12d,44(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r8d,%r12d # T1+=h - - mov %r9d,%r8d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r9d,%r13d - mov %r9d,%r14d - - ror $2,%r8d - ror $13,%r13d - mov %r9d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r8d - ror $9,%r13d - or %r11d,%r14d # a|c - - xor %r13d,%r8d # h=Sigma0(a) - and %r11d,%r15d # a&c - add %r12d,%eax # d+=T1 - - and %r10d,%r14d # (a|c)&b - add %r12d,%r8d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r8d # h+=Maj(a,b,c) - mov 52(%rsp),%r13d - mov 40(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 20(%rsp),%r12d - - add 48(%rsp),%r12d - mov %eax,%r13d - mov %eax,%r14d - mov %ebx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ecx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %eax,%r15d # (f^g)&e - mov %r12d,48(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %edx,%r12d # T1+=h - - mov %r8d,%edx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r8d,%r13d - mov %r8d,%r14d - - ror $2,%edx - ror $13,%r13d - mov %r8d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%edx - ror $9,%r13d - or %r10d,%r14d # a|c - - xor %r13d,%edx # h=Sigma0(a) - and %r10d,%r15d # a&c - add %r12d,%r11d # d+=T1 - - and %r9d,%r14d # (a|c)&b - add %r12d,%edx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%edx # h+=Maj(a,b,c) - mov 56(%rsp),%r13d - mov 44(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 24(%rsp),%r12d - - add 52(%rsp),%r12d - mov %r11d,%r13d - mov %r11d,%r14d - mov %eax,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ebx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r11d,%r15d # (f^g)&e - mov %r12d,52(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ecx,%r12d # T1+=h - - mov %edx,%ecx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %edx,%r13d - mov %edx,%r14d - - ror $2,%ecx - ror $13,%r13d - mov %edx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ecx - ror $9,%r13d - or %r9d,%r14d # a|c - - xor %r13d,%ecx # h=Sigma0(a) - and %r9d,%r15d # a&c - add %r12d,%r10d # d+=T1 - - and %r8d,%r14d # (a|c)&b - add %r12d,%ecx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ecx # h+=Maj(a,b,c) - mov 60(%rsp),%r13d - mov 48(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror 
$2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 28(%rsp),%r12d - - add 56(%rsp),%r12d - mov %r10d,%r13d - mov %r10d,%r14d - mov %r11d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %eax,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r10d,%r15d # (f^g)&e - mov %r12d,56(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ebx,%r12d # T1+=h - - mov %ecx,%ebx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ecx,%r13d - mov %ecx,%r14d - - ror $2,%ebx - ror $13,%r13d - mov %ecx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ebx - ror $9,%r13d - or %r8d,%r14d # a|c - - xor %r13d,%ebx # h=Sigma0(a) - and %r8d,%r15d # a&c - add %r12d,%r9d # d+=T1 - - and %edx,%r14d # (a|c)&b - add %r12d,%ebx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ebx # h+=Maj(a,b,c) - mov 0(%rsp),%r13d - mov 52(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 32(%rsp),%r12d - - add 60(%rsp),%r12d - mov %r9d,%r13d - mov %r9d,%r14d - mov %r10d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r11d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r9d,%r15d # (f^g)&e - mov %r12d,60(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %eax,%r12d # T1+=h - - mov %ebx,%eax - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ebx,%r13d - mov %ebx,%r14d - - ror $2,%eax - ror $13,%r13d - mov %ebx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%eax - ror $9,%r13d - or %edx,%r14d # a|c - - xor %r13d,%eax # h=Sigma0(a) - and %edx,%r15d # a&c - add %r12d,%r8d # d+=T1 - - and %ecx,%r14d # (a|c)&b - add %r12d,%eax # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%eax # h+=Maj(a,b,c) - cmp $64,%rdi - jb .Lrounds_16_xx - - mov 16*4+0*8(%rsp),%rdi - lea 16*4(%rsi),%rsi - - add 4*0(%rdi),%eax - add 4*1(%rdi),%ebx - add 4*2(%rdi),%ecx - add 4*3(%rdi),%edx - add 4*4(%rdi),%r8d - add 4*5(%rdi),%r9d - add 4*6(%rdi),%r10d - add 4*7(%rdi),%r11d - - cmp 16*4+2*8(%rsp),%rsi - - mov %eax,4*0(%rdi) - mov %ebx,4*1(%rdi) - mov %ecx,4*2(%rdi) - mov %edx,4*3(%rdi) - mov %r8d,4*4(%rdi) - mov %r9d,4*5(%rdi) - mov %r10d,4*6(%rdi) - mov %r11d,4*7(%rdi) - jb .Lloop - - mov 16*4+3*8(%rsp),%rsp -.cfi_def_cfa %rsp,56 - pop %r15 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r15 - pop %r14 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r14 - pop %r13 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r13 - pop %r12 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r12 - pop %rbp -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbp - pop %rbx -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbx - - RET -.cfi_endproc -SET_SIZE(SHA256TransformBlocks) - -SECTION_STATIC -.balign 64 -SET_OBJ(K256) -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 
0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -#endif /* !lint && !__lint */ - -#ifdef __ELF__ -.section .note.GNU-stack,"",%progbits -#endif diff --git a/module/icp/asm-x86_64/sha2/sha512-x86_64.S b/module/icp/asm-x86_64/sha2/sha512-x86_64.S new file mode 100644 index 000000000000..29f103965147 --- /dev/null +++ b/module/icp/asm-x86_64/sha2/sha512-x86_64.S @@ -0,0 +1,4011 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Portions Copyright (c) 2022 Tino Reichardt + * - modified assembly to fit into OpenZFS + */ + +#if defined(__x86_64) + +#define _ASM +#include + +SECTION_STATIC + +.align 64 +.type K512,@object +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 
0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f + +ENTRY_ALIGN(zfs_sha512_transform_x64, 16) +.cfi_startproc + ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue: + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop +.align 16 +.Lloop: + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%r10 + addq %r12,%rcx + addq 
%r12,%r10 + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + rorq 
$6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + 
movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + leaq 24(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%rdi + addq 
(%rbp),%r12 + xorq %r9,%r14 + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + addq 40(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + xorq %r12,%r13 + shrq 
$7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + addq 72(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + rorq $4,%r13 + addq 
%rdx,%r12 + xorq %rcx,%r15 + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) + jnz .Lrounds_16_xx + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + cmpq 128+16(%rsp),%rsi + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + RET +.cfi_endproc 
+SET_SIZE(zfs_sha512_transform_x64) + +ENTRY_ALIGN(zfs_sha512_transform_avx, 64) +.cfi_startproc + ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq 
$23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 
48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + 
shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + 
vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq 
%rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 
+ shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + RET +.cfi_endproc +SET_SIZE(zfs_sha512_transform_avx) + +ENTRY_ALIGN(zfs_sha512_transform_avx2, 64) +.cfi_startproc + ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $1312,%rsp + shlq $4,%rdx + andq $-2048,%rsp + leaq (%rsi,%rdx,8),%rdx + addq $1152,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-128,%rsi + movq 0(%rdi),%rax + movq %rsi,%r12 + movq 8(%rdi),%rbx + cmpq %rdx,%rsi + movq 16(%rdi),%rcx + cmoveq %rsp,%r12 + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqu -128(%rsi),%xmm0 + vmovdqu -128+16(%rsi),%xmm1 + vmovdqu -128+32(%rsi),%xmm2 + leaq K512+128(%rip),%rbp + vmovdqu -128+48(%rsi),%xmm3 + vmovdqu -128+64(%rsi),%xmm4 + vmovdqu -128+80(%rsi),%xmm5 + vmovdqu -128+96(%rsi),%xmm6 + vmovdqu -128+112(%rsi),%xmm7 + + vmovdqa 1152(%rbp),%ymm10 + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm10,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm10,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + vpshufb %ymm10,%ymm2,%ymm2 + vinserti128 $1,64(%r12),%ymm4,%ymm4 + vpshufb %ymm10,%ymm3,%ymm3 + vinserti128 $1,80(%r12),%ymm5,%ymm5 + vpshufb %ymm10,%ymm4,%ymm4 + vinserti128 $1,96(%r12),%ymm6,%ymm6 + vpshufb %ymm10,%ymm5,%ymm5 + vinserti128 $1,112(%r12),%ymm7,%ymm7 + + vpaddq 
-128(%rbp),%ymm0,%ymm8 + vpshufb %ymm10,%ymm6,%ymm6 + vpaddq -96(%rbp),%ymm1,%ymm9 + vpshufb %ymm10,%ymm7,%ymm7 + vpaddq -64(%rbp),%ymm2,%ymm10 + vpaddq -32(%rbp),%ymm3,%ymm11 + vmovdqa %ymm8,0(%rsp) + vpaddq 0(%rbp),%ymm4,%ymm8 + vmovdqa %ymm9,32(%rsp) + vpaddq 32(%rbp),%ymm5,%ymm9 + vmovdqa %ymm10,64(%rsp) + vpaddq 64(%rbp),%ymm6,%ymm10 + vmovdqa %ymm11,96(%rsp) + + movq 152(%rsp),%rdi +.cfi_def_cfa %rdi,8 + leaq -128(%rsp),%rsp + + + + movq %rdi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpaddq 96(%rbp),%ymm7,%ymm11 + vmovdqa %ymm8,0(%rsp) + xorq %r14,%r14 + vmovdqa %ymm9,32(%rsp) + movq %rbx,%rdi + vmovdqa %ymm10,64(%rsp) + xorq %rcx,%rdi + vmovdqa %ymm11,96(%rsp) + movq %r9,%r12 + addq $32*8,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + leaq -128(%rsp),%rsp +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 + + pushq 128-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $8,%ymm0,%ymm1,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm4,%ymm5,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm0,%ymm0 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm7,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm7,%ymm10 + vpaddq %ymm8,%ymm0,%ymm0 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm7,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm0,%ymm0 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq -128(%rbp),%ymm0,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm5,%ymm6,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm1,%ymm1 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm0,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm0,%ymm10 + vpaddq 
%ymm8,%ymm1,%ymm1 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm0,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm1,%ymm1 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq -96(%rbp),%ymm1,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm6,%ymm7,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm2,%ymm2 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm1,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm1,%ymm10 + vpaddq %ymm8,%ymm2,%ymm2 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm1,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm2,%ymm2 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq -64(%rbp),%ymm2,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm7,%ymm0,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm3,%ymm3 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm2,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm2,%ymm10 + vpaddq %ymm8,%ymm3,%ymm3 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm2,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq 
(%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm3,%ymm3 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq -32(%rbp),%ymm3,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq -128(%rsp),%rsp +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 + + pushq 128-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $8,%ymm4,%ymm5,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm0,%ymm1,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm4,%ymm4 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm3,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm3,%ymm10 + vpaddq %ymm8,%ymm4,%ymm4 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm3,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm4,%ymm4 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq 0(%rbp),%ymm4,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm5,%ymm6,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm1,%ymm2,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm5,%ymm5 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm4,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm4,%ymm10 + vpaddq %ymm8,%ymm5,%ymm5 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm4,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq 
(%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm5,%ymm5 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq 32(%rbp),%ymm5,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm6,%ymm7,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm2,%ymm3,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm6,%ymm6 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm5,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm5,%ymm10 + vpaddq %ymm8,%ymm6,%ymm6 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm5,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm6,%ymm6 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq 64(%rbp),%ymm6,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm7,%ymm0,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm3,%ymm4,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm7,%ymm7 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm6,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm6,%ymm10 + vpaddq %ymm8,%ymm7,%ymm7 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm6,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor 
%ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm7,%ymm7 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq 96(%rbp),%ymm7,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq 256(%rbp),%rbp + cmpb $0,-121(%rbp) + jne .Lavx2_00_47 + addq 0+128(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+128(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+128(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+128(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+128(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+128(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+128(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + 
rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+128(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + addq 0(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq 
%rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rbp + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + cmpq 144(%rbp),%rsi + je .Ldone_avx2 + + xorq %r14,%r14 + movq %rbx,%rdi + xorq %rcx,%rdi + movq %r9,%r12 + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addq 0+16(%rbp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+16(%rbp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+16(%rbp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+16(%rbp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + 
xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+16(%rbp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+16(%rbp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+16(%rbp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+16(%rbp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + leaq -128(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rsp + +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + leaq 256(%rsi),%rsi + addq 48(%rdi),%r10 + movq %rsi,%r12 + addq 56(%rdi),%r11 + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + cmoveq %rsp,%r12 + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + +.cfi_escape 0x0f,0x06,0x76,0x98,0x01,0x06,0x23,0x08 + +.Ldone_avx2: + movq 152(%rbp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + RET +.cfi_endproc +SET_SIZE(zfs_sha512_transform_avx2) + +#if defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif +#endif diff --git 
a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S deleted file mode 100644 index b2f7d4863d8a..000000000000 --- a/module/icp/asm-x86_64/sha2/sha512_impl.S +++ /dev/null @@ -1,2115 +0,0 @@ -/* - * ==================================================================== - * Written by Andy Polyakov for the OpenSSL - * project. Rights for redistribution and usage in source and binary - * forms are granted according to the OpenSSL license. - * ==================================================================== - * - * sha256/512_block procedure for x86_64. - * - * 40% improvement over compiler-generated code on Opteron. On EM64T - * sha256 was observed to run >80% faster and sha512 - >40%. No magical - * tricks, just straight implementation... I really wonder why gcc - * [being armed with inline assembler] fails to generate as fast code. - * The only thing which is cool about this module is that it's very - * same instruction sequence used for both SHA-256 and SHA-512. In - * former case the instructions operate on 32-bit operands, while in - * latter - on 64-bit ones. All I had to do is to get one flavor right, - * the other one passed the test right away:-) - * - * sha256_block runs in ~1005 cycles on Opteron, which gives you - * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock - * frequency in GHz. sha512_block runs in ~1275 cycles, which results - * in 128*1000/1275=100MBps per GHz. Is there room for improvement? - * Well, if you compare it to IA-64 implementation, which maintains - * X[16] in register bank[!], tends to 4 instructions per CPU clock - * cycle and runs in 1003 cycles, 1275 is very good result for 3-way - * issue Opteron pipeline and X[16] maintained in memory. So that *if* - * there is a way to improve it, *then* the only way would be to try to - * offload X[16] updates to SSE unit, but that would require "deeper" - * loop unroll, which in turn would naturally cause size blow-up, not - * to mention increased complexity! And once again, only *if* it's - * actually possible to noticeably improve overall ILP, instruction - * level parallelism, on a given CPU implementation in this case. - * - * Special note on Intel EM64T. While Opteron CPU exhibits perfect - * performance ratio of 1.5 between 64- and 32-bit flavors [see above], - * [currently available] EM64T CPUs apparently are far from it. On the - * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit - * sha256_block:-( This is presumably because 64-bit shifts/rotates - * apparently are not atomic instructions, but implemented in microcode. - */ - -/* - * OpenSolaris OS modifications - * - * Sun elects to use this software under the BSD license. - * - * This source originates from OpenSSL file sha512-x86_64.pl at - * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz - * (presumably for future OpenSSL release 0.9.8h), with these changes: - * - * 1. Added perl "use strict" and declared variables. - * - * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from - * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. - * - * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) - * assemblers). Replaced the .picmeup macro with assembler code. - * - * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", - * at the beginning of SHA2_CTX (the next field is 8-byte aligned). 
- */ - -/* - * This file was generated by a perl script (sha512-x86_64.pl) that were - * used to generate sha256 and sha512 variants from the same code base. - * The comments from the original file have been pasted above. - */ - - -#if defined(lint) || defined(__lint) -#include -#include - -void -SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) -{ - (void) ctx, (void) in, (void) num; -} - - -#else -#define _ASM -#include - -ENTRY_NP(SHA512TransformBlocks) -.cfi_startproc - ENDBR - movq %rsp, %rax -.cfi_def_cfa_register %rax - push %rbx -.cfi_offset %rbx,-16 - push %rbp -.cfi_offset %rbp,-24 - push %r12 -.cfi_offset %r12,-32 - push %r13 -.cfi_offset %r13,-40 - push %r14 -.cfi_offset %r14,-48 - push %r15 -.cfi_offset %r15,-56 - mov %rsp,%rbp # copy %rsp - shl $4,%rdx # num*16 - sub $16*8+4*8,%rsp - lea (%rsi,%rdx,8),%rdx # inp+num*16*8 - and $-64,%rsp # align stack frame - add $8,%rdi # Skip OpenSolaris field, "algotype" - mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg - mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg - mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg - mov %rbp,16*8+3*8(%rsp) # save copy of %rsp -# echo ".cfi_cfa_expression %rsp+152,deref,+56" | -# openssl/crypto/perlasm/x86_64-xlate.pl -.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x38 - - #.picmeup %rbp - # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts - # the address of the "next" instruction into the target register - # (%rbp). This generates these 2 instructions: - lea .Llea(%rip),%rbp - #nop # .picmeup generates a nop for mod 8 alignment--not needed here - -.Llea: - lea K512-.(%rbp),%rbp - - mov 8*0(%rdi),%rax - mov 8*1(%rdi),%rbx - mov 8*2(%rdi),%rcx - mov 8*3(%rdi),%rdx - mov 8*4(%rdi),%r8 - mov 8*5(%rdi),%r9 - mov 8*6(%rdi),%r10 - mov 8*7(%rdi),%r11 - jmp .Lloop - -.balign 16 -.Lloop: - xor %rdi,%rdi - mov 8*0(%rsi),%r12 - bswap %r12 - mov %r8,%r13 - mov %r8,%r14 - mov %r9,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r10,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r8,%r15 # (f^g)&e - mov %r12,0(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r11,%r12 # T1+=h - - mov %rax,%r11 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rax,%r13 - mov %rax,%r14 - - ror $28,%r11 - ror $34,%r13 - mov %rax,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r11 - ror $5,%r13 - or %rcx,%r14 # a|c - - xor %r13,%r11 # h=Sigma0(a) - and %rcx,%r15 # a&c - add %r12,%rdx # d+=T1 - - and %rbx,%r14 # (a|c)&b - add %r12,%r11 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r11 # h+=Maj(a,b,c) - mov 8*1(%rsi),%r12 - bswap %r12 - mov %rdx,%r13 - mov %rdx,%r14 - mov %r8,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r9,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rdx,%r15 # (f^g)&e - mov %r12,8(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r10,%r12 # T1+=h - - mov %r11,%r10 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r11,%r13 - mov %r11,%r14 - - ror $28,%r10 - ror $34,%r13 - mov %r11,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r10 - ror $5,%r13 - or %rbx,%r14 # a|c - - xor %r13,%r10 # h=Sigma0(a) - and %rbx,%r15 # a&c - add %r12,%rcx # d+=T1 - - and %rax,%r14 # (a|c)&b - add %r12,%r10 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r10 # h+=Maj(a,b,c) - mov 8*2(%rsi),%r12 - bswap %r12 - mov %rcx,%r13 - mov %rcx,%r14 - mov %rdx,%r15 - - ror $14,%r13 - ror 
$18,%r14 - xor %r8,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rcx,%r15 # (f^g)&e - mov %r12,16(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r9,%r12 # T1+=h - - mov %r10,%r9 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r10,%r13 - mov %r10,%r14 - - ror $28,%r9 - ror $34,%r13 - mov %r10,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r9 - ror $5,%r13 - or %rax,%r14 # a|c - - xor %r13,%r9 # h=Sigma0(a) - and %rax,%r15 # a&c - add %r12,%rbx # d+=T1 - - and %r11,%r14 # (a|c)&b - add %r12,%r9 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r9 # h+=Maj(a,b,c) - mov 8*3(%rsi),%r12 - bswap %r12 - mov %rbx,%r13 - mov %rbx,%r14 - mov %rcx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rdx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rbx,%r15 # (f^g)&e - mov %r12,24(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r8,%r12 # T1+=h - - mov %r9,%r8 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r9,%r13 - mov %r9,%r14 - - ror $28,%r8 - ror $34,%r13 - mov %r9,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r8 - ror $5,%r13 - or %r11,%r14 # a|c - - xor %r13,%r8 # h=Sigma0(a) - and %r11,%r15 # a&c - add %r12,%rax # d+=T1 - - and %r10,%r14 # (a|c)&b - add %r12,%r8 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r8 # h+=Maj(a,b,c) - mov 8*4(%rsi),%r12 - bswap %r12 - mov %rax,%r13 - mov %rax,%r14 - mov %rbx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rcx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rax,%r15 # (f^g)&e - mov %r12,32(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rdx,%r12 # T1+=h - - mov %r8,%rdx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r8,%r13 - mov %r8,%r14 - - ror $28,%rdx - ror $34,%r13 - mov %r8,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rdx - ror $5,%r13 - or %r10,%r14 # a|c - - xor %r13,%rdx # h=Sigma0(a) - and %r10,%r15 # a&c - add %r12,%r11 # d+=T1 - - and %r9,%r14 # (a|c)&b - add %r12,%rdx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rdx # h+=Maj(a,b,c) - mov 8*5(%rsi),%r12 - bswap %r12 - mov %r11,%r13 - mov %r11,%r14 - mov %rax,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rbx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r11,%r15 # (f^g)&e - mov %r12,40(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rcx,%r12 # T1+=h - - mov %rdx,%rcx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rdx,%r13 - mov %rdx,%r14 - - ror $28,%rcx - ror $34,%r13 - mov %rdx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rcx - ror $5,%r13 - or %r9,%r14 # a|c - - xor %r13,%rcx # h=Sigma0(a) - and %r9,%r15 # a&c - add %r12,%r10 # d+=T1 - - and %r8,%r14 # (a|c)&b - add %r12,%rcx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rcx # h+=Maj(a,b,c) - mov 8*6(%rsi),%r12 - bswap %r12 - mov %r10,%r13 - mov %r10,%r14 - mov %r11,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rax,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r10,%r15 # (f^g)&e - mov %r12,48(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rbx,%r12 # T1+=h - - mov %rcx,%rbx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rcx,%r13 - mov %rcx,%r14 - - ror $28,%rbx - ror $34,%r13 - mov 
%rcx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rbx - ror $5,%r13 - or %r8,%r14 # a|c - - xor %r13,%rbx # h=Sigma0(a) - and %r8,%r15 # a&c - add %r12,%r9 # d+=T1 - - and %rdx,%r14 # (a|c)&b - add %r12,%rbx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rbx # h+=Maj(a,b,c) - mov 8*7(%rsi),%r12 - bswap %r12 - mov %r9,%r13 - mov %r9,%r14 - mov %r10,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r11,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r9,%r15 # (f^g)&e - mov %r12,56(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rax,%r12 # T1+=h - - mov %rbx,%rax - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rbx,%r13 - mov %rbx,%r14 - - ror $28,%rax - ror $34,%r13 - mov %rbx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rax - ror $5,%r13 - or %rdx,%r14 # a|c - - xor %r13,%rax # h=Sigma0(a) - and %rdx,%r15 # a&c - add %r12,%r8 # d+=T1 - - and %rcx,%r14 # (a|c)&b - add %r12,%rax # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rax # h+=Maj(a,b,c) - mov 8*8(%rsi),%r12 - bswap %r12 - mov %r8,%r13 - mov %r8,%r14 - mov %r9,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r10,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r8,%r15 # (f^g)&e - mov %r12,64(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r11,%r12 # T1+=h - - mov %rax,%r11 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rax,%r13 - mov %rax,%r14 - - ror $28,%r11 - ror $34,%r13 - mov %rax,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r11 - ror $5,%r13 - or %rcx,%r14 # a|c - - xor %r13,%r11 # h=Sigma0(a) - and %rcx,%r15 # a&c - add %r12,%rdx # d+=T1 - - and %rbx,%r14 # (a|c)&b - add %r12,%r11 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r11 # h+=Maj(a,b,c) - mov 8*9(%rsi),%r12 - bswap %r12 - mov %rdx,%r13 - mov %rdx,%r14 - mov %r8,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r9,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rdx,%r15 # (f^g)&e - mov %r12,72(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r10,%r12 # T1+=h - - mov %r11,%r10 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r11,%r13 - mov %r11,%r14 - - ror $28,%r10 - ror $34,%r13 - mov %r11,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r10 - ror $5,%r13 - or %rbx,%r14 # a|c - - xor %r13,%r10 # h=Sigma0(a) - and %rbx,%r15 # a&c - add %r12,%rcx # d+=T1 - - and %rax,%r14 # (a|c)&b - add %r12,%r10 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r10 # h+=Maj(a,b,c) - mov 8*10(%rsi),%r12 - bswap %r12 - mov %rcx,%r13 - mov %rcx,%r14 - mov %rdx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r8,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rcx,%r15 # (f^g)&e - mov %r12,80(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r9,%r12 # T1+=h - - mov %r10,%r9 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r10,%r13 - mov %r10,%r14 - - ror $28,%r9 - ror $34,%r13 - mov %r10,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r9 - ror $5,%r13 - or %rax,%r14 # a|c - - xor %r13,%r9 # h=Sigma0(a) - and %rax,%r15 # a&c - add %r12,%rbx # d+=T1 - - and %r11,%r14 # (a|c)&b - add %r12,%r9 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r9 # h+=Maj(a,b,c) - mov 
8*11(%rsi),%r12 - bswap %r12 - mov %rbx,%r13 - mov %rbx,%r14 - mov %rcx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rdx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rbx,%r15 # (f^g)&e - mov %r12,88(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r8,%r12 # T1+=h - - mov %r9,%r8 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r9,%r13 - mov %r9,%r14 - - ror $28,%r8 - ror $34,%r13 - mov %r9,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r8 - ror $5,%r13 - or %r11,%r14 # a|c - - xor %r13,%r8 # h=Sigma0(a) - and %r11,%r15 # a&c - add %r12,%rax # d+=T1 - - and %r10,%r14 # (a|c)&b - add %r12,%r8 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r8 # h+=Maj(a,b,c) - mov 8*12(%rsi),%r12 - bswap %r12 - mov %rax,%r13 - mov %rax,%r14 - mov %rbx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rcx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rax,%r15 # (f^g)&e - mov %r12,96(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rdx,%r12 # T1+=h - - mov %r8,%rdx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r8,%r13 - mov %r8,%r14 - - ror $28,%rdx - ror $34,%r13 - mov %r8,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rdx - ror $5,%r13 - or %r10,%r14 # a|c - - xor %r13,%rdx # h=Sigma0(a) - and %r10,%r15 # a&c - add %r12,%r11 # d+=T1 - - and %r9,%r14 # (a|c)&b - add %r12,%rdx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rdx # h+=Maj(a,b,c) - mov 8*13(%rsi),%r12 - bswap %r12 - mov %r11,%r13 - mov %r11,%r14 - mov %rax,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rbx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r11,%r15 # (f^g)&e - mov %r12,104(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rcx,%r12 # T1+=h - - mov %rdx,%rcx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rdx,%r13 - mov %rdx,%r14 - - ror $28,%rcx - ror $34,%r13 - mov %rdx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rcx - ror $5,%r13 - or %r9,%r14 # a|c - - xor %r13,%rcx # h=Sigma0(a) - and %r9,%r15 # a&c - add %r12,%r10 # d+=T1 - - and %r8,%r14 # (a|c)&b - add %r12,%rcx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rcx # h+=Maj(a,b,c) - mov 8*14(%rsi),%r12 - bswap %r12 - mov %r10,%r13 - mov %r10,%r14 - mov %r11,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rax,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r10,%r15 # (f^g)&e - mov %r12,112(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rbx,%r12 # T1+=h - - mov %rcx,%rbx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rcx,%r13 - mov %rcx,%r14 - - ror $28,%rbx - ror $34,%r13 - mov %rcx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rbx - ror $5,%r13 - or %r8,%r14 # a|c - - xor %r13,%rbx # h=Sigma0(a) - and %r8,%r15 # a&c - add %r12,%r9 # d+=T1 - - and %rdx,%r14 # (a|c)&b - add %r12,%rbx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rbx # h+=Maj(a,b,c) - mov 8*15(%rsi),%r12 - bswap %r12 - mov %r9,%r13 - mov %r9,%r14 - mov %r10,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r11,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r9,%r15 # (f^g)&e - mov %r12,120(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rax,%r12 # T1+=h - - mov %rbx,%rax - add %r13,%r12 # 
T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rbx,%r13 - mov %rbx,%r14 - - ror $28,%rax - ror $34,%r13 - mov %rbx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rax - ror $5,%r13 - or %rdx,%r14 # a|c - - xor %r13,%rax # h=Sigma0(a) - and %rdx,%r15 # a&c - add %r12,%r8 # d+=T1 - - and %rcx,%r14 # (a|c)&b - add %r12,%rax # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rax # h+=Maj(a,b,c) - jmp .Lrounds_16_xx -.balign 16 -.Lrounds_16_xx: - mov 8(%rsp),%r13 - mov 112(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 72(%rsp),%r12 - - add 0(%rsp),%r12 - mov %r8,%r13 - mov %r8,%r14 - mov %r9,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r10,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r8,%r15 # (f^g)&e - mov %r12,0(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r11,%r12 # T1+=h - - mov %rax,%r11 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rax,%r13 - mov %rax,%r14 - - ror $28,%r11 - ror $34,%r13 - mov %rax,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r11 - ror $5,%r13 - or %rcx,%r14 # a|c - - xor %r13,%r11 # h=Sigma0(a) - and %rcx,%r15 # a&c - add %r12,%rdx # d+=T1 - - and %rbx,%r14 # (a|c)&b - add %r12,%r11 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r11 # h+=Maj(a,b,c) - mov 16(%rsp),%r13 - mov 120(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 80(%rsp),%r12 - - add 8(%rsp),%r12 - mov %rdx,%r13 - mov %rdx,%r14 - mov %r8,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r9,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rdx,%r15 # (f^g)&e - mov %r12,8(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r10,%r12 # T1+=h - - mov %r11,%r10 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r11,%r13 - mov %r11,%r14 - - ror $28,%r10 - ror $34,%r13 - mov %r11,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r10 - ror $5,%r13 - or %rbx,%r14 # a|c - - xor %r13,%r10 # h=Sigma0(a) - and %rbx,%r15 # a&c - add %r12,%rcx # d+=T1 - - and %rax,%r14 # (a|c)&b - add %r12,%r10 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r10 # h+=Maj(a,b,c) - mov 24(%rsp),%r13 - mov 0(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 88(%rsp),%r12 - - add 16(%rsp),%r12 - mov %rcx,%r13 - mov %rcx,%r14 - mov %rdx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r8,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rcx,%r15 # (f^g)&e - mov %r12,16(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r9,%r12 # T1+=h - - mov %r10,%r9 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r10,%r13 - mov %r10,%r14 - - ror $28,%r9 - ror $34,%r13 - mov %r10,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - 
xor %r13,%r9 - ror $5,%r13 - or %rax,%r14 # a|c - - xor %r13,%r9 # h=Sigma0(a) - and %rax,%r15 # a&c - add %r12,%rbx # d+=T1 - - and %r11,%r14 # (a|c)&b - add %r12,%r9 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r9 # h+=Maj(a,b,c) - mov 32(%rsp),%r13 - mov 8(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 96(%rsp),%r12 - - add 24(%rsp),%r12 - mov %rbx,%r13 - mov %rbx,%r14 - mov %rcx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rdx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rbx,%r15 # (f^g)&e - mov %r12,24(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r8,%r12 # T1+=h - - mov %r9,%r8 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r9,%r13 - mov %r9,%r14 - - ror $28,%r8 - ror $34,%r13 - mov %r9,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r8 - ror $5,%r13 - or %r11,%r14 # a|c - - xor %r13,%r8 # h=Sigma0(a) - and %r11,%r15 # a&c - add %r12,%rax # d+=T1 - - and %r10,%r14 # (a|c)&b - add %r12,%r8 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r8 # h+=Maj(a,b,c) - mov 40(%rsp),%r13 - mov 16(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 104(%rsp),%r12 - - add 32(%rsp),%r12 - mov %rax,%r13 - mov %rax,%r14 - mov %rbx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rcx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rax,%r15 # (f^g)&e - mov %r12,32(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rdx,%r12 # T1+=h - - mov %r8,%rdx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r8,%r13 - mov %r8,%r14 - - ror $28,%rdx - ror $34,%r13 - mov %r8,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rdx - ror $5,%r13 - or %r10,%r14 # a|c - - xor %r13,%rdx # h=Sigma0(a) - and %r10,%r15 # a&c - add %r12,%r11 # d+=T1 - - and %r9,%r14 # (a|c)&b - add %r12,%rdx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rdx # h+=Maj(a,b,c) - mov 48(%rsp),%r13 - mov 24(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 112(%rsp),%r12 - - add 40(%rsp),%r12 - mov %r11,%r13 - mov %r11,%r14 - mov %rax,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rbx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r11,%r15 # (f^g)&e - mov %r12,40(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rcx,%r12 # T1+=h - - mov %rdx,%rcx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rdx,%r13 - mov %rdx,%r14 - - ror $28,%rcx - ror $34,%r13 - mov %rdx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rcx - ror $5,%r13 - or %r9,%r14 # a|c - - xor %r13,%rcx # h=Sigma0(a) - and %r9,%r15 # a&c - add %r12,%r10 # d+=T1 - - and %r8,%r14 # (a|c)&b - add %r12,%rcx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - 
lea 1(%rdi),%rdi # round++ - - add %r14,%rcx # h+=Maj(a,b,c) - mov 56(%rsp),%r13 - mov 32(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 120(%rsp),%r12 - - add 48(%rsp),%r12 - mov %r10,%r13 - mov %r10,%r14 - mov %r11,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rax,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r10,%r15 # (f^g)&e - mov %r12,48(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rbx,%r12 # T1+=h - - mov %rcx,%rbx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rcx,%r13 - mov %rcx,%r14 - - ror $28,%rbx - ror $34,%r13 - mov %rcx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rbx - ror $5,%r13 - or %r8,%r14 # a|c - - xor %r13,%rbx # h=Sigma0(a) - and %r8,%r15 # a&c - add %r12,%r9 # d+=T1 - - and %rdx,%r14 # (a|c)&b - add %r12,%rbx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rbx # h+=Maj(a,b,c) - mov 64(%rsp),%r13 - mov 40(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 0(%rsp),%r12 - - add 56(%rsp),%r12 - mov %r9,%r13 - mov %r9,%r14 - mov %r10,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r11,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r9,%r15 # (f^g)&e - mov %r12,56(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rax,%r12 # T1+=h - - mov %rbx,%rax - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rbx,%r13 - mov %rbx,%r14 - - ror $28,%rax - ror $34,%r13 - mov %rbx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rax - ror $5,%r13 - or %rdx,%r14 # a|c - - xor %r13,%rax # h=Sigma0(a) - and %rdx,%r15 # a&c - add %r12,%r8 # d+=T1 - - and %rcx,%r14 # (a|c)&b - add %r12,%rax # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rax # h+=Maj(a,b,c) - mov 72(%rsp),%r13 - mov 48(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 8(%rsp),%r12 - - add 64(%rsp),%r12 - mov %r8,%r13 - mov %r8,%r14 - mov %r9,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r10,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r8,%r15 # (f^g)&e - mov %r12,64(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r11,%r12 # T1+=h - - mov %rax,%r11 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rax,%r13 - mov %rax,%r14 - - ror $28,%r11 - ror $34,%r13 - mov %rax,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r11 - ror $5,%r13 - or %rcx,%r14 # a|c - - xor %r13,%r11 # h=Sigma0(a) - and %rcx,%r15 # a&c - add %r12,%rdx # d+=T1 - - and %rbx,%r14 # (a|c)&b - add %r12,%r11 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r11 # h+=Maj(a,b,c) - mov 80(%rsp),%r13 - mov 56(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # 
sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 16(%rsp),%r12 - - add 72(%rsp),%r12 - mov %rdx,%r13 - mov %rdx,%r14 - mov %r8,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r9,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rdx,%r15 # (f^g)&e - mov %r12,72(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r10,%r12 # T1+=h - - mov %r11,%r10 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r11,%r13 - mov %r11,%r14 - - ror $28,%r10 - ror $34,%r13 - mov %r11,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r10 - ror $5,%r13 - or %rbx,%r14 # a|c - - xor %r13,%r10 # h=Sigma0(a) - and %rbx,%r15 # a&c - add %r12,%rcx # d+=T1 - - and %rax,%r14 # (a|c)&b - add %r12,%r10 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r10 # h+=Maj(a,b,c) - mov 88(%rsp),%r13 - mov 64(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 24(%rsp),%r12 - - add 80(%rsp),%r12 - mov %rcx,%r13 - mov %rcx,%r14 - mov %rdx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r8,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rcx,%r15 # (f^g)&e - mov %r12,80(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r9,%r12 # T1+=h - - mov %r10,%r9 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r10,%r13 - mov %r10,%r14 - - ror $28,%r9 - ror $34,%r13 - mov %r10,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r9 - ror $5,%r13 - or %rax,%r14 # a|c - - xor %r13,%r9 # h=Sigma0(a) - and %rax,%r15 # a&c - add %r12,%rbx # d+=T1 - - and %r11,%r14 # (a|c)&b - add %r12,%r9 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r9 # h+=Maj(a,b,c) - mov 96(%rsp),%r13 - mov 72(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 32(%rsp),%r12 - - add 88(%rsp),%r12 - mov %rbx,%r13 - mov %rbx,%r14 - mov %rcx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rdx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rbx,%r15 # (f^g)&e - mov %r12,88(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r8,%r12 # T1+=h - - mov %r9,%r8 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r9,%r13 - mov %r9,%r14 - - ror $28,%r8 - ror $34,%r13 - mov %r9,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r8 - ror $5,%r13 - or %r11,%r14 # a|c - - xor %r13,%r8 # h=Sigma0(a) - and %r11,%r15 # a&c - add %r12,%rax # d+=T1 - - and %r10,%r14 # (a|c)&b - add %r12,%r8 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r8 # h+=Maj(a,b,c) - mov 104(%rsp),%r13 - mov 80(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 40(%rsp),%r12 - - add 96(%rsp),%r12 - mov %rax,%r13 - mov 
%rax,%r14 - mov %rbx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rcx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rax,%r15 # (f^g)&e - mov %r12,96(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rdx,%r12 # T1+=h - - mov %r8,%rdx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r8,%r13 - mov %r8,%r14 - - ror $28,%rdx - ror $34,%r13 - mov %r8,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rdx - ror $5,%r13 - or %r10,%r14 # a|c - - xor %r13,%rdx # h=Sigma0(a) - and %r10,%r15 # a&c - add %r12,%r11 # d+=T1 - - and %r9,%r14 # (a|c)&b - add %r12,%rdx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rdx # h+=Maj(a,b,c) - mov 112(%rsp),%r13 - mov 88(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 48(%rsp),%r12 - - add 104(%rsp),%r12 - mov %r11,%r13 - mov %r11,%r14 - mov %rax,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rbx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r11,%r15 # (f^g)&e - mov %r12,104(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rcx,%r12 # T1+=h - - mov %rdx,%rcx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rdx,%r13 - mov %rdx,%r14 - - ror $28,%rcx - ror $34,%r13 - mov %rdx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rcx - ror $5,%r13 - or %r9,%r14 # a|c - - xor %r13,%rcx # h=Sigma0(a) - and %r9,%r15 # a&c - add %r12,%r10 # d+=T1 - - and %r8,%r14 # (a|c)&b - add %r12,%rcx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rcx # h+=Maj(a,b,c) - mov 120(%rsp),%r13 - mov 96(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 56(%rsp),%r12 - - add 112(%rsp),%r12 - mov %r10,%r13 - mov %r10,%r14 - mov %r11,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rax,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r10,%r15 # (f^g)&e - mov %r12,112(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rbx,%r12 # T1+=h - - mov %rcx,%rbx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rcx,%r13 - mov %rcx,%r14 - - ror $28,%rbx - ror $34,%r13 - mov %rcx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rbx - ror $5,%r13 - or %r8,%r14 # a|c - - xor %r13,%rbx # h=Sigma0(a) - and %r8,%r15 # a&c - add %r12,%r9 # d+=T1 - - and %rdx,%r14 # (a|c)&b - add %r12,%rbx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rbx # h+=Maj(a,b,c) - mov 0(%rsp),%r13 - mov 104(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 64(%rsp),%r12 - - add 120(%rsp),%r12 - mov %r9,%r13 - mov %r9,%r14 - mov %r10,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r11,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r9,%r15 # (f^g)&e - mov %r12,120(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r11,%r15 # 
Ch(e,f,g)=((f^g)&e)^g - add %rax,%r12 # T1+=h - - mov %rbx,%rax - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rbx,%r13 - mov %rbx,%r14 - - ror $28,%rax - ror $34,%r13 - mov %rbx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rax - ror $5,%r13 - or %rdx,%r14 # a|c - - xor %r13,%rax # h=Sigma0(a) - and %rdx,%r15 # a&c - add %r12,%r8 # d+=T1 - - and %rcx,%r14 # (a|c)&b - add %r12,%rax # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rax # h+=Maj(a,b,c) - cmp $80,%rdi - jb .Lrounds_16_xx - - mov 16*8+0*8(%rsp),%rdi - lea 16*8(%rsi),%rsi - - add 8*0(%rdi),%rax - add 8*1(%rdi),%rbx - add 8*2(%rdi),%rcx - add 8*3(%rdi),%rdx - add 8*4(%rdi),%r8 - add 8*5(%rdi),%r9 - add 8*6(%rdi),%r10 - add 8*7(%rdi),%r11 - - cmp 16*8+2*8(%rsp),%rsi - - mov %rax,8*0(%rdi) - mov %rbx,8*1(%rdi) - mov %rcx,8*2(%rdi) - mov %rdx,8*3(%rdi) - mov %r8,8*4(%rdi) - mov %r9,8*5(%rdi) - mov %r10,8*6(%rdi) - mov %r11,8*7(%rdi) - jb .Lloop - - mov 16*8+3*8(%rsp),%rsp -.cfi_def_cfa %rsp,56 - pop %r15 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r15 - pop %r14 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r14 - pop %r13 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r13 - pop %r12 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r12 - pop %rbp -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbp - pop %rbx -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbx - - RET -.cfi_endproc -SET_SIZE(SHA512TransformBlocks) - -SECTION_STATIC -.balign 64 -SET_OBJ(K512) -K512: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 -#endif /* !lint && !__lint */ - -#if defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/module/icp/include/generic_impl.c b/module/icp/include/generic_impl.c new file mode 100644 index 
000000000000..16f802cf7558 --- /dev/null +++ b/module/icp/include/generic_impl.c @@ -0,0 +1,233 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2003, 2010 Oracle and/or its affiliates. + * Copyright (c) 2022 Tino Reichardt + */ + +/* + * This file gets included by C files for implementing the full set + * of zfs_impl.h defines. + * + * It's meant to ease maintaining multiple implementations of + * algorithms. Look into blake3_impl.c, sha256_impl.c or sha512_impl.c + * for reference. + */ + +#include +#include +#include + +/* Two default implementations */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX - 1) + +#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) + +/* Implementation that contains the fastest method */ +static IMPL_OPS_T generic_fastest_impl = { + .name = "fastest" +}; + +/* Hold all supported implementations */ +static const IMPL_OPS_T *generic_supp_impls[ARRAY_SIZE(IMPL_ARRAY)]; +static uint32_t generic_supp_impls_cnt = 0; + +/* Currently selected implementation */ +static uint32_t generic_impl_chosen = IMPL_FASTEST; + +static struct generic_impl_selector { + const char *name; + uint32_t sel; +} generic_impl_selectors[] = { + { "cycle", IMPL_CYCLE }, + { "fastest", IMPL_FASTEST } +}; + +/* check the supported implementations */ +static void +generic_impl_init(void) +{ + int i, c; + + /* init only once */ + if (likely(generic_supp_impls_cnt != 0)) + return; + + /* Move supported implementations into generic_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(IMPL_ARRAY); i++) { + const IMPL_OPS_T *impl = IMPL_ARRAY[i]; + + if (impl->is_supported && impl->is_supported()) + generic_supp_impls[c++] = impl; + } + generic_supp_impls_cnt = c; + + /* first init generic impl, may be changed via set_fastest() */ + memcpy(&generic_fastest_impl, generic_supp_impls[0], + sizeof (generic_fastest_impl)); +} + +/* get number of supported implementations */ +static uint32_t +generic_impl_getcnt(void) +{ + generic_impl_init(); + return (generic_supp_impls_cnt); +} + +/* get id of selected implementation */ +static uint32_t +generic_impl_getid(void) +{ + generic_impl_init(); + return (IMPL_READ(generic_impl_chosen)); +} + +/* get name of selected implementation */ +static const char * +generic_impl_getname(void) +{ + uint32_t impl = IMPL_READ(generic_impl_chosen); + + generic_impl_init(); + switch (impl) { + case IMPL_FASTEST: + return ("fastest"); + case IMPL_CYCLE: + return ("cycle"); + default: + return (generic_supp_impls[impl]->name); + } +} + +/* set implementation by id */ +static void +generic_impl_setid(uint32_t id) +{ + generic_impl_init(); + switch (id) { + case IMPL_FASTEST: + atomic_swap_32(&generic_impl_chosen, IMPL_FASTEST); + break; + case IMPL_CYCLE: +
atomic_swap_32(&generic_impl_chosen, IMPL_CYCLE); + break; + default: + ASSERT3U(id, <, generic_supp_impls_cnt); + atomic_swap_32(&generic_impl_chosen, id); + break; + } +} + +/* set implementation by name */ +static int +generic_impl_setname(const char *val) +{ + uint32_t impl = IMPL_READ(generic_impl_chosen); + size_t val_len; + int i, err = -EINVAL; + + generic_impl_init(); + val_len = strlen(val); + while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ + val_len--; + + /* check mandatory implementations */ + for (i = 0; i < ARRAY_SIZE(generic_impl_selectors); i++) { + const char *name = generic_impl_selectors[i].name; + + if (val_len == strlen(name) && + strncmp(val, name, val_len) == 0) { + impl = generic_impl_selectors[i].sel; + err = 0; + break; + } + } + + /* check all supported implementations */ + if (err != 0) { + for (i = 0; i < generic_supp_impls_cnt; i++) { + const char *name = generic_supp_impls[i]->name; + + if (val_len == strlen(name) && + strncmp(val, name, val_len) == 0) { + impl = i; + err = 0; + break; + } + } + } + + if (err == 0) { + atomic_swap_32(&generic_impl_chosen, impl); + } + + return (err); +} + +/* setup id as fastest implementation */ +static void +generic_impl_set_fastest(uint32_t id) +{ + generic_impl_init(); + memcpy(&generic_fastest_impl, generic_supp_impls[id], + sizeof (generic_fastest_impl)); +} + +/* return impl iterating functions */ +const zfs_impl_t ZFS_IMPL_OPS = { + .name = IMPL_NAME, + .getcnt = generic_impl_getcnt, + .getid = generic_impl_getid, + .getname = generic_impl_getname, + .set_fastest = generic_impl_set_fastest, + .setid = generic_impl_setid, + .setname = generic_impl_setname +}; + +/* get impl ops_t of selected implementation */ +const IMPL_OPS_T * +IMPL_GET_OPS(void) +{ + const IMPL_OPS_T *ops = NULL; + uint32_t idx, impl = IMPL_READ(generic_impl_chosen); + static uint32_t cycle_count = 0; + + generic_impl_init(); + switch (impl) { + case IMPL_FASTEST: + ops = &generic_fastest_impl; + break; + case IMPL_CYCLE: + idx = (++cycle_count) % generic_supp_impls_cnt; + ops = generic_supp_impls[idx]; + break; + default: + ASSERT3U(impl, <, generic_supp_impls_cnt); + ops = generic_supp_impls[impl]; + break; + } + + ASSERT3P(ops, !=, NULL); + return (ops); +} diff --git a/module/icp/include/sha2/sha2_consts.h b/module/icp/include/sha2/sha2_consts.h deleted file mode 100644 index b33ddf821843..000000000000 --- a/module/icp/include/sha2/sha2_consts.h +++ /dev/null @@ -1,219 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_SHA2_CONSTS_H -#define _SYS_SHA2_CONSTS_H - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Loading 32-bit constants on a sparc is expensive since it involves both - * a `sethi' and an `or'. thus, we instead use `ld' to load the constants - * from an array called `sha2_consts'. however, on intel (and perhaps other - * processors), it is cheaper to load the constant directly. thus, the c - * code in SHA transform functions uses the macro SHA2_CONST() which either - * expands to a constant or an array reference, depending on - * the architecture the code is being compiled for. - * - * SHA512 constants are used for SHA384 - */ - -#include /* uint32_t */ - -extern const uint32_t sha256_consts[]; -extern const uint64_t sha512_consts[]; - -#if defined(__sparc) -#define SHA256_CONST(x) (sha256_consts[x]) -#define SHA512_CONST(x) (sha512_consts[x]) -#else -#define SHA256_CONST(x) (SHA256_CONST_ ## x) -#define SHA512_CONST(x) (SHA512_CONST_ ## x) -#endif - -/* constants, as provided in FIPS 180-2 */ - -#define SHA256_CONST_0 0x428a2f98U -#define SHA256_CONST_1 0x71374491U -#define SHA256_CONST_2 0xb5c0fbcfU -#define SHA256_CONST_3 0xe9b5dba5U -#define SHA256_CONST_4 0x3956c25bU -#define SHA256_CONST_5 0x59f111f1U -#define SHA256_CONST_6 0x923f82a4U -#define SHA256_CONST_7 0xab1c5ed5U - -#define SHA256_CONST_8 0xd807aa98U -#define SHA256_CONST_9 0x12835b01U -#define SHA256_CONST_10 0x243185beU -#define SHA256_CONST_11 0x550c7dc3U -#define SHA256_CONST_12 0x72be5d74U -#define SHA256_CONST_13 0x80deb1feU -#define SHA256_CONST_14 0x9bdc06a7U -#define SHA256_CONST_15 0xc19bf174U - -#define SHA256_CONST_16 0xe49b69c1U -#define SHA256_CONST_17 0xefbe4786U -#define SHA256_CONST_18 0x0fc19dc6U -#define SHA256_CONST_19 0x240ca1ccU -#define SHA256_CONST_20 0x2de92c6fU -#define SHA256_CONST_21 0x4a7484aaU -#define SHA256_CONST_22 0x5cb0a9dcU -#define SHA256_CONST_23 0x76f988daU - -#define SHA256_CONST_24 0x983e5152U -#define SHA256_CONST_25 0xa831c66dU -#define SHA256_CONST_26 0xb00327c8U -#define SHA256_CONST_27 0xbf597fc7U -#define SHA256_CONST_28 0xc6e00bf3U -#define SHA256_CONST_29 0xd5a79147U -#define SHA256_CONST_30 0x06ca6351U -#define SHA256_CONST_31 0x14292967U - -#define SHA256_CONST_32 0x27b70a85U -#define SHA256_CONST_33 0x2e1b2138U -#define SHA256_CONST_34 0x4d2c6dfcU -#define SHA256_CONST_35 0x53380d13U -#define SHA256_CONST_36 0x650a7354U -#define SHA256_CONST_37 0x766a0abbU -#define SHA256_CONST_38 0x81c2c92eU -#define SHA256_CONST_39 0x92722c85U - -#define SHA256_CONST_40 0xa2bfe8a1U -#define SHA256_CONST_41 0xa81a664bU -#define SHA256_CONST_42 0xc24b8b70U -#define SHA256_CONST_43 0xc76c51a3U -#define SHA256_CONST_44 0xd192e819U -#define SHA256_CONST_45 0xd6990624U -#define SHA256_CONST_46 0xf40e3585U -#define SHA256_CONST_47 0x106aa070U - -#define SHA256_CONST_48 0x19a4c116U -#define SHA256_CONST_49 0x1e376c08U -#define SHA256_CONST_50 0x2748774cU -#define SHA256_CONST_51 0x34b0bcb5U -#define SHA256_CONST_52 0x391c0cb3U -#define SHA256_CONST_53 0x4ed8aa4aU -#define SHA256_CONST_54 0x5b9cca4fU -#define SHA256_CONST_55 0x682e6ff3U - -#define SHA256_CONST_56 0x748f82eeU -#define SHA256_CONST_57 0x78a5636fU -#define SHA256_CONST_58 0x84c87814U -#define SHA256_CONST_59 0x8cc70208U -#define SHA256_CONST_60 0x90befffaU -#define SHA256_CONST_61 0xa4506cebU -#define SHA256_CONST_62 0xbef9a3f7U -#define SHA256_CONST_63 0xc67178f2U - -#define SHA512_CONST_0 0x428a2f98d728ae22ULL -#define SHA512_CONST_1 0x7137449123ef65cdULL -#define SHA512_CONST_2 0xb5c0fbcfec4d3b2fULL -#define 
SHA512_CONST_3 0xe9b5dba58189dbbcULL -#define SHA512_CONST_4 0x3956c25bf348b538ULL -#define SHA512_CONST_5 0x59f111f1b605d019ULL -#define SHA512_CONST_6 0x923f82a4af194f9bULL -#define SHA512_CONST_7 0xab1c5ed5da6d8118ULL -#define SHA512_CONST_8 0xd807aa98a3030242ULL -#define SHA512_CONST_9 0x12835b0145706fbeULL -#define SHA512_CONST_10 0x243185be4ee4b28cULL -#define SHA512_CONST_11 0x550c7dc3d5ffb4e2ULL -#define SHA512_CONST_12 0x72be5d74f27b896fULL -#define SHA512_CONST_13 0x80deb1fe3b1696b1ULL -#define SHA512_CONST_14 0x9bdc06a725c71235ULL -#define SHA512_CONST_15 0xc19bf174cf692694ULL -#define SHA512_CONST_16 0xe49b69c19ef14ad2ULL -#define SHA512_CONST_17 0xefbe4786384f25e3ULL -#define SHA512_CONST_18 0x0fc19dc68b8cd5b5ULL -#define SHA512_CONST_19 0x240ca1cc77ac9c65ULL -#define SHA512_CONST_20 0x2de92c6f592b0275ULL -#define SHA512_CONST_21 0x4a7484aa6ea6e483ULL -#define SHA512_CONST_22 0x5cb0a9dcbd41fbd4ULL -#define SHA512_CONST_23 0x76f988da831153b5ULL -#define SHA512_CONST_24 0x983e5152ee66dfabULL -#define SHA512_CONST_25 0xa831c66d2db43210ULL -#define SHA512_CONST_26 0xb00327c898fb213fULL -#define SHA512_CONST_27 0xbf597fc7beef0ee4ULL -#define SHA512_CONST_28 0xc6e00bf33da88fc2ULL -#define SHA512_CONST_29 0xd5a79147930aa725ULL -#define SHA512_CONST_30 0x06ca6351e003826fULL -#define SHA512_CONST_31 0x142929670a0e6e70ULL -#define SHA512_CONST_32 0x27b70a8546d22ffcULL -#define SHA512_CONST_33 0x2e1b21385c26c926ULL -#define SHA512_CONST_34 0x4d2c6dfc5ac42aedULL -#define SHA512_CONST_35 0x53380d139d95b3dfULL -#define SHA512_CONST_36 0x650a73548baf63deULL -#define SHA512_CONST_37 0x766a0abb3c77b2a8ULL -#define SHA512_CONST_38 0x81c2c92e47edaee6ULL -#define SHA512_CONST_39 0x92722c851482353bULL -#define SHA512_CONST_40 0xa2bfe8a14cf10364ULL -#define SHA512_CONST_41 0xa81a664bbc423001ULL -#define SHA512_CONST_42 0xc24b8b70d0f89791ULL -#define SHA512_CONST_43 0xc76c51a30654be30ULL -#define SHA512_CONST_44 0xd192e819d6ef5218ULL -#define SHA512_CONST_45 0xd69906245565a910ULL -#define SHA512_CONST_46 0xf40e35855771202aULL -#define SHA512_CONST_47 0x106aa07032bbd1b8ULL -#define SHA512_CONST_48 0x19a4c116b8d2d0c8ULL -#define SHA512_CONST_49 0x1e376c085141ab53ULL -#define SHA512_CONST_50 0x2748774cdf8eeb99ULL -#define SHA512_CONST_51 0x34b0bcb5e19b48a8ULL -#define SHA512_CONST_52 0x391c0cb3c5c95a63ULL -#define SHA512_CONST_53 0x4ed8aa4ae3418acbULL -#define SHA512_CONST_54 0x5b9cca4f7763e373ULL -#define SHA512_CONST_55 0x682e6ff3d6b2b8a3ULL -#define SHA512_CONST_56 0x748f82ee5defb2fcULL -#define SHA512_CONST_57 0x78a5636f43172f60ULL -#define SHA512_CONST_58 0x84c87814a1f0ab72ULL -#define SHA512_CONST_59 0x8cc702081a6439ecULL -#define SHA512_CONST_60 0x90befffa23631e28ULL -#define SHA512_CONST_61 0xa4506cebde82bde9ULL -#define SHA512_CONST_62 0xbef9a3f7b2c67915ULL -#define SHA512_CONST_63 0xc67178f2e372532bULL -#define SHA512_CONST_64 0xca273eceea26619cULL -#define SHA512_CONST_65 0xd186b8c721c0c207ULL -#define SHA512_CONST_66 0xeada7dd6cde0eb1eULL -#define SHA512_CONST_67 0xf57d4f7fee6ed178ULL -#define SHA512_CONST_68 0x06f067aa72176fbaULL -#define SHA512_CONST_69 0x0a637dc5a2c898a6ULL -#define SHA512_CONST_70 0x113f9804bef90daeULL -#define SHA512_CONST_71 0x1b710b35131c471bULL -#define SHA512_CONST_72 0x28db77f523047d84ULL -#define SHA512_CONST_73 0x32caab7b40c72493ULL -#define SHA512_CONST_74 0x3c9ebe0a15c9bebcULL -#define SHA512_CONST_75 0x431d67c49c100d4cULL -#define SHA512_CONST_76 0x4cc5d4becb3e42b6ULL -#define SHA512_CONST_77 0x597f299cfc657e2aULL -#define SHA512_CONST_78 
0x5fcb6fab3ad6faecULL -#define SHA512_CONST_79 0x6c44198c4a475817ULL - - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SHA2_CONSTS_H */ diff --git a/module/icp/include/sha2/sha2_impl.h b/module/icp/include/sha2/sha2_impl.h index 0e89747eefd1..9a1bd38f1a77 100644 --- a/module/icp/include/sha2/sha2_impl.h +++ b/module/icp/include/sha2/sha2_impl.h @@ -18,9 +18,10 @@ * * CDDL HEADER END */ + /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2022 Tino Reichardt */ #ifndef _SHA2_IMPL_H @@ -32,6 +33,28 @@ extern "C" { #endif +/* transform function definition */ +typedef void (*sha256_f)(uint32_t state[8], const void *data, size_t blks); +typedef void (*sha512_f)(uint64_t state[8], const void *data, size_t blks); + +/* needed for checking valid implementations */ +typedef boolean_t (*sha2_is_supported_f)(void); + +typedef struct { + const char *name; + sha256_f transform; + sha2_is_supported_f is_supported; +} sha256_ops_t; + +typedef struct { + const char *name; + sha512_f transform; + sha2_is_supported_f is_supported; +} sha512_ops_t; + +extern const sha256_ops_t *sha256_get_ops(void); +extern const sha512_ops_t *sha512_get_ops(void); + typedef enum { SHA1_TYPE, SHA256_TYPE, diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c index a58f0982c8c0..f068951b07f5 100644 --- a/module/icp/io/sha2_mod.c +++ b/module/icp/io/sha2_mod.c @@ -28,7 +28,6 @@ #include #include #include -#define _SHA2_IMPL #include #include diff --git a/module/os/freebsd/spl/sha224.h b/module/os/freebsd/spl/sha224.h deleted file mode 100644 index 0abd43068708..000000000000 --- a/module/os/freebsd/spl/sha224.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2005 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
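/*
 * Editor's sketch, not part of the patch: how a caller might drive the
 * sha256_ops_t backend interface added in the sha2_impl.h hunk above.
 * The transform callback only compresses whole 64-byte blocks into the
 * running state; padding and finalization remain in the generic SHA-2
 * code.  The helper name below is hypothetical and only shows the call
 * shape implied by the declared typedefs.
 */
static void
sha256_one_block_sketch(const uint8_t block[64])
{
	/* FIPS 180-4 initial hash value for SHA-256 (cf. SHA256_Init below) */
	uint32_t state[8] = {
		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
	};
	const sha256_ops_t *ops = sha256_get_ops();

	if (ops->is_supported())
		ops->transform(state, block, 1);	/* one 64-byte block */
}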
- * - * $FreeBSD$ - */ - -#ifndef _SHA224_H_ -#define _SHA224_H_ - -#ifndef _KERNEL -#include -#endif - -#define SHA224_BLOCK_LENGTH 64 -#define SHA224_DIGEST_LENGTH 28 -#define SHA224_DIGEST_STRING_LENGTH (SHA224_DIGEST_LENGTH * 2 + 1) - -typedef struct SHA224Context { - uint32_t state[8]; - uint64_t count; - uint8_t buf[SHA224_BLOCK_LENGTH]; -} SHA224_CTX; - -__BEGIN_DECLS - -/* Ensure libmd symbols do not clash with libcrypto */ - -#ifndef SHA224_Init -#define SHA224_Init _libmd_SHA224_Init -#endif -#ifndef SHA224_Update -#define SHA224_Update _libmd_SHA224_Update -#endif -#ifndef SHA224_Final -#define SHA224_Final _libmd_SHA224_Final -#endif -#ifndef SHA224_End -#define SHA224_End _libmd_SHA224_End -#endif -#ifndef SHA224_Fd -#define SHA224_Fd _libmd_SHA224_Fd -#endif -#ifndef SHA224_FdChunk -#define SHA224_FdChunk _libmd_SHA224_FdChunk -#endif -#ifndef SHA224_File -#define SHA224_File _libmd_SHA224_File -#endif -#ifndef SHA224_FileChunk -#define SHA224_FileChunk _libmd_SHA224_FileChunk -#endif -#ifndef SHA224_Data -#define SHA224_Data _libmd_SHA224_Data -#endif - -#ifndef SHA224_version -#define SHA224_version _libmd_SHA224_version -#endif - -void SHA224_Init(SHA224_CTX *); -void SHA224_Update(SHA224_CTX *, const void *, size_t); -void SHA224_Final(unsigned char [__min_size(SHA224_DIGEST_LENGTH)], - SHA224_CTX *); -#ifndef _KERNEL -char *SHA224_End(SHA224_CTX *, char *); -char *SHA224_Data(const void *, unsigned int, char *); -char *SHA224_Fd(int, char *); -char *SHA224_FdChunk(int, char *, off_t, off_t); -char *SHA224_File(const char *, char *); -char *SHA224_FileChunk(const char *, char *, off_t, off_t); -#endif -__END_DECLS - -#endif /* !_SHA224_H_ */ diff --git a/module/os/freebsd/spl/sha256.h b/module/os/freebsd/spl/sha256.h deleted file mode 100644 index 193c0c025120..000000000000 --- a/module/os/freebsd/spl/sha256.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright 2005 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _SHA256_H_ -#define _SHA256_H_ - -#ifndef _KERNEL -#include -#endif - -#define SHA256_BLOCK_LENGTH 64 -#define SHA256_DIGEST_LENGTH 32 -#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1) - -typedef struct SHA256Context { - uint32_t state[8]; - uint64_t count; - uint8_t buf[SHA256_BLOCK_LENGTH]; -} SHA256_CTX; - -__BEGIN_DECLS - -/* Ensure libmd symbols do not clash with libcrypto */ - -#ifndef SHA256_Init -#define SHA256_Init _libmd_SHA256_Init -#endif -#ifndef SHA256_Update -#define SHA256_Update _libmd_SHA256_Update -#endif -#ifndef SHA256_Final -#define SHA256_Final _libmd_SHA256_Final -#endif -#ifndef SHA256_End -#define SHA256_End _libmd_SHA256_End -#endif -#ifndef SHA256_Fd -#define SHA256_Fd _libmd_SHA256_Fd -#endif -#ifndef SHA256_FdChunk -#define SHA256_FdChunk _libmd_SHA256_FdChunk -#endif -#ifndef SHA256_File -#define SHA256_File _libmd_SHA256_File -#endif -#ifndef SHA256_FileChunk -#define SHA256_FileChunk _libmd_SHA256_FileChunk -#endif -#ifndef SHA256_Data -#define SHA256_Data _libmd_SHA256_Data -#endif - -#ifndef SHA256_Transform -#define SHA256_Transform _libmd_SHA256_Transform -#endif -#ifndef SHA256_version -#define SHA256_version _libmd_SHA256_version -#endif - -void SHA256_Init(SHA256_CTX *); -void SHA256_Update(SHA256_CTX *, const void *, size_t); -void SHA256_Final(unsigned char [__min_size(SHA256_DIGEST_LENGTH)], - SHA256_CTX *); -#ifndef _KERNEL -char *SHA256_End(SHA256_CTX *, char *); -char *SHA256_Data(const void *, unsigned int, char *); -char *SHA256_Fd(int, char *); -char *SHA256_FdChunk(int, char *, off_t, off_t); -char *SHA256_File(const char *, char *); -char *SHA256_FileChunk(const char *, char *, off_t, off_t); -#endif -__END_DECLS - -#endif /* !_SHA256_H_ */ diff --git a/module/os/freebsd/spl/sha256c.c b/module/os/freebsd/spl/sha256c.c deleted file mode 100644 index 52cf0df6c99d..000000000000 --- a/module/os/freebsd/spl/sha256c.c +++ /dev/null @@ -1,378 +0,0 @@ -/* - * Copyright 2005 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
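/*
 * Editor's sketch, not part of the patch: the libmd-style interface declared
 * in the sha256.h header deleted above follows the usual streaming pattern;
 * after this change the same digest is produced through the common
 * sys/sha2.h interface instead.  Illustrative only, using the signatures
 * declared in that header.
 */
static void
sha256_digest_sketch(const void *buf, size_t len,
    unsigned char digest[SHA256_DIGEST_LENGTH])
{
	SHA256_CTX ctx;

	SHA256_Init(&ctx);
	SHA256_Update(&ctx, buf, len);	/* may be called repeatedly */
	SHA256_Final(digest, &ctx);	/* writes 32 bytes, clears the context */
}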
- */ - -#include -__FBSDID("$FreeBSD$"); - -#include - -#ifdef _KERNEL -#include -#else -#include -#endif - - -#include -#include -#include "sha224.h" -#include "sha256.h" - -#if BYTE_ORDER == BIG_ENDIAN - -/* Copy a vector of big-endian uint32_t into a vector of bytes */ -#define be32enc_vect(dst, src, len) \ - memcpy((void *)dst, (const void *)src, (size_t)len) - -/* Copy a vector of bytes into a vector of big-endian uint32_t */ -#define be32dec_vect(dst, src, len) \ - memcpy((void *)dst, (const void *)src, (size_t)len) - -#else /* BYTE_ORDER != BIG_ENDIAN */ - -/* - * Encode a length len/4 vector of (uint32_t) into a length len vector of - * (unsigned char) in big-endian form. Assumes len is a multiple of 4. - */ -static void -be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) -{ - size_t i; - - for (i = 0; i < len / 4; i++) - be32enc(dst + i * 4, src[i]); -} - -/* - * Decode a big-endian length len vector of (unsigned char) into a length - * len/4 vector of (uint32_t). Assumes len is a multiple of 4. - */ -static void -be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) -{ - size_t i; - - for (i = 0; i < len / 4; i++) - dst[i] = be32dec(src + i * 4); -} - -#endif /* BYTE_ORDER != BIG_ENDIAN */ - -/* SHA256 round constants. */ -static const uint32_t K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) - -/* SHA256 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - h += S1(e) + Ch(e, f, g) + k; \ - d += h; \ - h += S0(a) + Maj(a, b, c); - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i, ii) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i + ii] + K[i + ii]) - -/* Message schedule computation */ -#define MSCH(W, ii, i) \ - W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \ - s0(W[i + ii + 1]) + W[i + ii] - -/* - * SHA256 block compression function. The 256-bit state is transformed via - * the 512-bit input block to produce a new state. - */ -static void -SHA256_Transform(uint32_t *state, const unsigned char block[64]) -{ - uint32_t W[64]; - uint32_t S[8]; - int i; - - /* 1. Prepare the first part of the message schedule W. */ - be32dec_vect(W, block, 64); - - /* 2. Initialize working variables. */ - memcpy(S, state, 32); - - /* 3. Mix. 
*/ - for (i = 0; i < 64; i += 16) { - RNDr(S, W, 0, i); - RNDr(S, W, 1, i); - RNDr(S, W, 2, i); - RNDr(S, W, 3, i); - RNDr(S, W, 4, i); - RNDr(S, W, 5, i); - RNDr(S, W, 6, i); - RNDr(S, W, 7, i); - RNDr(S, W, 8, i); - RNDr(S, W, 9, i); - RNDr(S, W, 10, i); - RNDr(S, W, 11, i); - RNDr(S, W, 12, i); - RNDr(S, W, 13, i); - RNDr(S, W, 14, i); - RNDr(S, W, 15, i); - - if (i == 48) - break; - MSCH(W, 0, i); - MSCH(W, 1, i); - MSCH(W, 2, i); - MSCH(W, 3, i); - MSCH(W, 4, i); - MSCH(W, 5, i); - MSCH(W, 6, i); - MSCH(W, 7, i); - MSCH(W, 8, i); - MSCH(W, 9, i); - MSCH(W, 10, i); - MSCH(W, 11, i); - MSCH(W, 12, i); - MSCH(W, 13, i); - MSCH(W, 14, i); - MSCH(W, 15, i); - } - - /* 4. Mix local working variables into global state */ - for (i = 0; i < 8; i++) - state[i] += S[i]; -} - -static unsigned char PAD[64] = { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* Add padding and terminating bit-count. */ -static void -SHA256_Pad(SHA256_CTX * ctx) -{ - size_t r; - - /* Figure out how many bytes we have buffered. */ - r = (ctx->count >> 3) & 0x3f; - - /* Pad to 56 mod 64, transforming if we finish a block en route. */ - if (r < 56) { - /* Pad to 56 mod 64. */ - memcpy(&ctx->buf[r], PAD, 56 - r); - } else { - /* Finish the current block and mix. */ - memcpy(&ctx->buf[r], PAD, 64 - r); - SHA256_Transform(ctx->state, ctx->buf); - - /* The start of the final block is all zeroes. */ - memset(&ctx->buf[0], 0, 56); - } - - /* Add the terminating bit-count. */ - be64enc(&ctx->buf[56], ctx->count); - - /* Mix in the final block. */ - SHA256_Transform(ctx->state, ctx->buf); -} - -/* SHA-256 initialization. Begins a SHA-256 operation. */ -void -SHA256_Init(SHA256_CTX * ctx) -{ - - /* Zero bits processed so far */ - ctx->count = 0; - - /* Magic initialization constants */ - ctx->state[0] = 0x6A09E667; - ctx->state[1] = 0xBB67AE85; - ctx->state[2] = 0x3C6EF372; - ctx->state[3] = 0xA54FF53A; - ctx->state[4] = 0x510E527F; - ctx->state[5] = 0x9B05688C; - ctx->state[6] = 0x1F83D9AB; - ctx->state[7] = 0x5BE0CD19; -} - -/* Add bytes into the hash */ -void -SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len) -{ - uint64_t bitlen; - uint32_t r; - const unsigned char *src = in; - - /* Number of bytes left in the buffer from previous updates */ - r = (ctx->count >> 3) & 0x3f; - - /* Convert the length into a number of bits */ - bitlen = len << 3; - - /* Update number of bits */ - ctx->count += bitlen; - - /* Handle the case where we don't need to perform any transforms */ - if (len < 64 - r) { - memcpy(&ctx->buf[r], src, len); - return; - } - - /* Finish the current block */ - memcpy(&ctx->buf[r], src, 64 - r); - SHA256_Transform(ctx->state, ctx->buf); - src += 64 - r; - len -= 64 - r; - - /* Perform complete blocks */ - while (len >= 64) { - SHA256_Transform(ctx->state, src); - src += 64; - len -= 64; - } - - /* Copy left over data into buffer */ - memcpy(ctx->buf, src, len); -} - -/* - * SHA-256 finalization. Pads the input data, exports the hash value, - * and clears the context state. 
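/*
 * Editor's note, not part of the patch: SHA256_Pad() above appends a 0x80
 * byte, zero-pads the buffer to 56 mod 64, and finishes with the 64-bit
 * big-endian bit count, so every message occupies a whole number of 64-byte
 * blocks.  A small sketch of that length arithmetic (hypothetical helper):
 */
static size_t
sha256_padded_len_sketch(size_t msglen)
{
	size_t r = msglen % 64;		/* bytes already in the last block */
	size_t pad = (r < 56) ? (56 - r) : (120 - r);	/* 0x80 plus zeros */

	return (msglen + pad + 8);	/* + 8-byte length field */
}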
- */ -void -SHA256_Final(unsigned char digest[static SHA256_DIGEST_LENGTH], SHA256_CTX *ctx) -{ - - /* Add padding */ - SHA256_Pad(ctx); - - /* Write the hash */ - be32enc_vect(digest, ctx->state, SHA256_DIGEST_LENGTH); - - /* Clear the context state */ - memset(ctx, 0, sizeof (*ctx)); -} - -/* SHA-224: ******************************************************* */ -/* - * the SHA224 and SHA256 transforms are identical - */ - -/* SHA-224 initialization. Begins a SHA-224 operation. */ -void -SHA224_Init(SHA224_CTX * ctx) -{ - - /* Zero bits processed so far */ - ctx->count = 0; - - /* Magic initialization constants */ - ctx->state[0] = 0xC1059ED8; - ctx->state[1] = 0x367CD507; - ctx->state[2] = 0x3070DD17; - ctx->state[3] = 0xF70E5939; - ctx->state[4] = 0xFFC00B31; - ctx->state[5] = 0x68581511; - ctx->state[6] = 0x64f98FA7; - ctx->state[7] = 0xBEFA4FA4; -} - -/* Add bytes into the SHA-224 hash */ -void -SHA224_Update(SHA224_CTX * ctx, const void *in, size_t len) -{ - - SHA256_Update((SHA256_CTX *)ctx, in, len); -} - -/* - * SHA-224 finalization. Pads the input data, exports the hash value, - * and clears the context state. - */ -void -SHA224_Final(unsigned char digest[static SHA224_DIGEST_LENGTH], SHA224_CTX *ctx) -{ - - /* Add padding */ - SHA256_Pad((SHA256_CTX *)ctx); - - /* Write the hash */ - be32enc_vect(digest, ctx->state, SHA224_DIGEST_LENGTH); - - /* Clear the context state */ - memset(ctx, 0, sizeof (*ctx)); -} - -#ifdef WEAK_REFS -/* - * When building libmd, provide weak references. Note: this is not - * activated in the context of compiling these sources for internal - * use in libcrypt. - */ -#undef SHA256_Init -__weak_reference(_libmd_SHA256_Init, SHA256_Init); -#undef SHA256_Update -__weak_reference(_libmd_SHA256_Update, SHA256_Update); -#undef SHA256_Final -__weak_reference(_libmd_SHA256_Final, SHA256_Final); -#undef SHA256_Transform -__weak_reference(_libmd_SHA256_Transform, SHA256_Transform); - -#undef SHA224_Init -__weak_reference(_libmd_SHA224_Init, SHA224_Init); -#undef SHA224_Update -__weak_reference(_libmd_SHA224_Update, SHA224_Update); -#undef SHA224_Final -__weak_reference(_libmd_SHA224_Final, SHA224_Final); -#endif diff --git a/module/os/freebsd/spl/sha384.h b/module/os/freebsd/spl/sha384.h deleted file mode 100644 index 67250cee0313..000000000000 --- a/module/os/freebsd/spl/sha384.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2005 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _SHA384_H_ -#define _SHA384_H_ - -#ifndef _KERNEL -#include -#endif - -#define SHA384_BLOCK_LENGTH 128 -#define SHA384_DIGEST_LENGTH 48 -#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1) - -typedef struct SHA384Context { - uint64_t state[8]; - uint64_t count[2]; - uint8_t buf[SHA384_BLOCK_LENGTH]; -} SHA384_CTX; - -__BEGIN_DECLS - -/* Ensure libmd symbols do not clash with libcrypto */ -#ifndef SHA384_Init -#define SHA384_Init _libmd_SHA384_Init -#endif -#ifndef SHA384_Update -#define SHA384_Update _libmd_SHA384_Update -#endif -#ifndef SHA384_Final -#define SHA384_Final _libmd_SHA384_Final -#endif -#ifndef SHA384_End -#define SHA384_End _libmd_SHA384_End -#endif -#ifndef SHA384_Fd -#define SHA384_Fd _libmd_SHA384_Fd -#endif -#ifndef SHA384_FdChunk -#define SHA384_FdChunk _libmd_SHA384_FdChunk -#endif -#ifndef SHA384_File -#define SHA384_File _libmd_SHA384_File -#endif -#ifndef SHA384_FileChunk -#define SHA384_FileChunk _libmd_SHA384_FileChunk -#endif -#ifndef SHA384_Data -#define SHA384_Data _libmd_SHA384_Data -#endif - -#ifndef SHA384_version -#define SHA384_version _libmd_SHA384_version -#endif - -void SHA384_Init(SHA384_CTX *); -void SHA384_Update(SHA384_CTX *, const void *, size_t); -void SHA384_Final(unsigned char [__min_size(SHA384_DIGEST_LENGTH)], - SHA384_CTX *); -#ifndef _KERNEL -char *SHA384_End(SHA384_CTX *, char *); -char *SHA384_Data(const void *, unsigned int, char *); -char *SHA384_Fd(int, char *); -char *SHA384_FdChunk(int, char *, off_t, off_t); -char *SHA384_File(const char *, char *); -char *SHA384_FileChunk(const char *, char *, off_t, off_t); -#endif - -__END_DECLS - -#endif /* !_SHA384_H_ */ diff --git a/module/os/freebsd/spl/sha512.h b/module/os/freebsd/spl/sha512.h deleted file mode 100644 index b6fb733ca54e..000000000000 --- a/module/os/freebsd/spl/sha512.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright 2005 Colin Percival - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _SHA512_H_ -#define _SHA512_H_ - -#ifndef _KERNEL -#include -#endif - -#define SHA512_BLOCK_LENGTH 128 -#define SHA512_DIGEST_LENGTH 64 -#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1) - -typedef struct SHA512Context { - uint64_t state[8]; - uint64_t count[2]; - uint8_t buf[SHA512_BLOCK_LENGTH]; -} SHA512_CTX; - -__BEGIN_DECLS - -/* Ensure libmd symbols do not clash with libcrypto */ -#if 0 -#ifndef SHA512_Init -#define SHA512_Init _libmd_SHA512_Init -#endif -#ifndef SHA512_Update -#define SHA512_Update _libmd_SHA512_Update -#endif -#ifndef SHA512_Final -#define SHA512_Final _libmd_SHA512_Final -#endif -#endif -#ifndef SHA512_End -#define SHA512_End _libmd_SHA512_End -#endif -#ifndef SHA512_Fd -#define SHA512_Fd _libmd_SHA512_Fd -#endif -#ifndef SHA512_FdChunk -#define SHA512_FdChunk _libmd_SHA512_FdChunk -#endif -#ifndef SHA512_File -#define SHA512_File _libmd_SHA512_File -#endif -#ifndef SHA512_FileChunk -#define SHA512_FileChunk _libmd_SHA512_FileChunk -#endif -#ifndef SHA512_Data -#define SHA512_Data _libmd_SHA512_Data -#endif - -#ifndef SHA512_Transform -#define SHA512_Transform _libmd_SHA512_Transform -#endif -#ifndef SHA512_version -#define SHA512_version _libmd_SHA512_version -#endif - -void SHA512_Init(SHA512_CTX *); -void SHA512_Update(SHA512_CTX *, const void *, size_t); -void SHA512_Final(unsigned char [__min_size(SHA512_DIGEST_LENGTH)], - SHA512_CTX *); -#ifndef _KERNEL -char *SHA512_End(SHA512_CTX *, char *); -char *SHA512_Data(const void *, unsigned int, char *); -char *SHA512_Fd(int, char *); -char *SHA512_FdChunk(int, char *, off_t, off_t); -char *SHA512_File(const char *, char *); -char *SHA512_FileChunk(const char *, char *, off_t, off_t); -#endif - -__END_DECLS - -#endif /* !_SHA512_H_ */ diff --git a/module/os/freebsd/spl/sha512c.c b/module/os/freebsd/spl/sha512c.c deleted file mode 100644 index 254cc21565c1..000000000000 --- a/module/os/freebsd/spl/sha512c.c +++ /dev/null @@ -1,508 +0,0 @@ -/* - * Copyright 2005 Colin Percival - * Copyright (c) 2015 Allan Jude - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include - -#ifdef _KERNEL -#include -#else -#include -#endif - -#include "sha512.h" -#include "sha512t.h" -#include "sha384.h" - -#if BYTE_ORDER == BIG_ENDIAN - -/* Copy a vector of big-endian uint64_t into a vector of bytes */ -#define be64enc_vect(dst, src, len) \ - memcpy((void *)dst, (const void *)src, (size_t)len) - -/* Copy a vector of bytes into a vector of big-endian uint64_t */ -#define be64dec_vect(dst, src, len) \ - memcpy((void *)dst, (const void *)src, (size_t)len) - -#else /* BYTE_ORDER != BIG_ENDIAN */ - -/* - * Encode a length len/4 vector of (uint64_t) into a length len vector of - * (unsigned char) in big-endian form. Assumes len is a multiple of 8. - */ -static void -be64enc_vect(unsigned char *dst, const uint64_t *src, size_t len) -{ - size_t i; - - for (i = 0; i < len / 8; i++) - be64enc(dst + i * 8, src[i]); -} - -/* - * Decode a big-endian length len vector of (unsigned char) into a length - * len/4 vector of (uint64_t). Assumes len is a multiple of 8. - */ -static void -be64dec_vect(uint64_t *dst, const unsigned char *src, size_t len) -{ - size_t i; - - for (i = 0; i < len / 8; i++) - dst[i] = be64dec(src + i * 8); -} - -#endif /* BYTE_ORDER != BIG_ENDIAN */ - -/* SHA512 round constants. 
*/ -static const uint64_t K[80] = { - 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, - 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, - 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, - 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, - 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, - 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, - 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, - 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, - 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, - 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, - 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, - 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, - 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, - 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, - 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, - 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, - 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, - 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, - 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, - 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, - 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, - 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, - 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, - 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, - 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, - 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, - 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, - 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, - 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, - 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, - 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, - 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, - 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, - 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, - 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, - 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, - 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, - 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, - 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, - 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL -}; - -/* Elementary functions used by SHA512 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) -#define ROTR(x, n) ((x >> n) | (x << (64 - n))) -#define S0(x) (ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39)) -#define S1(x) (ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41)) -#define s0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7)) -#define s1(x) (ROTR(x, 19) ^ ROTR(x, 61) ^ SHR(x, 6)) - -/* SHA512 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - h += S1(e) + Ch(e, f, g) + k; \ - d += h; \ - h += S0(a) + Maj(a, b, c); - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i, ii) \ - RND(S[(80 - i) % 8], S[(81 - i) % 8], \ - S[(82 - i) % 8], S[(83 - i) % 8], \ - S[(84 - i) % 8], S[(85 - i) % 8], \ - S[(86 - i) % 8], S[(87 - i) % 8], \ - W[i + ii] + K[i + ii]) - -/* Message schedule computation */ -#define MSCH(W, ii, i) \ - W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \ - s0(W[i + ii + 1]) + W[i + ii] - -/* - * SHA512 block compression function. The 512-bit state is transformed via - * the 512-bit input block to produce a new state. - */ -static void -SHA512_Transform(uint64_t *state, - const unsigned char block[SHA512_BLOCK_LENGTH]) -{ - uint64_t W[80]; - uint64_t S[8]; - int i; - - /* 1. Prepare the first part of the message schedule W. */ - be64dec_vect(W, block, SHA512_BLOCK_LENGTH); - - /* 2. Initialize working variables. */ - memcpy(S, state, SHA512_DIGEST_LENGTH); - - /* 3. Mix. 
*/ - for (i = 0; i < 80; i += 16) { - RNDr(S, W, 0, i); - RNDr(S, W, 1, i); - RNDr(S, W, 2, i); - RNDr(S, W, 3, i); - RNDr(S, W, 4, i); - RNDr(S, W, 5, i); - RNDr(S, W, 6, i); - RNDr(S, W, 7, i); - RNDr(S, W, 8, i); - RNDr(S, W, 9, i); - RNDr(S, W, 10, i); - RNDr(S, W, 11, i); - RNDr(S, W, 12, i); - RNDr(S, W, 13, i); - RNDr(S, W, 14, i); - RNDr(S, W, 15, i); - - if (i == 64) - break; - MSCH(W, 0, i); - MSCH(W, 1, i); - MSCH(W, 2, i); - MSCH(W, 3, i); - MSCH(W, 4, i); - MSCH(W, 5, i); - MSCH(W, 6, i); - MSCH(W, 7, i); - MSCH(W, 8, i); - MSCH(W, 9, i); - MSCH(W, 10, i); - MSCH(W, 11, i); - MSCH(W, 12, i); - MSCH(W, 13, i); - MSCH(W, 14, i); - MSCH(W, 15, i); - } - - /* 4. Mix local working variables into global state */ - for (i = 0; i < 8; i++) - state[i] += S[i]; -} - -static unsigned char PAD[SHA512_BLOCK_LENGTH] = { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* Add padding and terminating bit-count. */ -static void -SHA512_Pad(SHA512_CTX * ctx) -{ - size_t r; - - /* Figure out how many bytes we have buffered. */ - r = (ctx->count[1] >> 3) & 0x7f; - - /* Pad to 112 mod 128, transforming if we finish a block en route. */ - if (r < 112) { - /* Pad to 112 mod 128. */ - memcpy(&ctx->buf[r], PAD, 112 - r); - } else { - /* Finish the current block and mix. */ - memcpy(&ctx->buf[r], PAD, 128 - r); - SHA512_Transform(ctx->state, ctx->buf); - - /* The start of the final block is all zeroes. */ - memset(&ctx->buf[0], 0, 112); - } - - /* Add the terminating bit-count. */ - be64enc_vect(&ctx->buf[112], ctx->count, 16); - - /* Mix in the final block. */ - SHA512_Transform(ctx->state, ctx->buf); -} - -/* SHA-512 initialization. Begins a SHA-512 operation. 
*/ -void -SHA512_Init(SHA512_CTX * ctx) -{ - - /* Zero bits processed so far */ - ctx->count[0] = ctx->count[1] = 0; - - /* Magic initialization constants */ - ctx->state[0] = 0x6a09e667f3bcc908ULL; - ctx->state[1] = 0xbb67ae8584caa73bULL; - ctx->state[2] = 0x3c6ef372fe94f82bULL; - ctx->state[3] = 0xa54ff53a5f1d36f1ULL; - ctx->state[4] = 0x510e527fade682d1ULL; - ctx->state[5] = 0x9b05688c2b3e6c1fULL; - ctx->state[6] = 0x1f83d9abfb41bd6bULL; - ctx->state[7] = 0x5be0cd19137e2179ULL; -} - -/* Add bytes into the hash */ -void -SHA512_Update(SHA512_CTX * ctx, const void *in, size_t len) -{ - uint64_t bitlen[2]; - uint64_t r; - const unsigned char *src = in; - - /* Number of bytes left in the buffer from previous updates */ - r = (ctx->count[1] >> 3) & 0x7f; - - /* Convert the length into a number of bits */ - bitlen[1] = ((uint64_t)len) << 3; - bitlen[0] = ((uint64_t)len) >> 61; - - /* Update number of bits */ - if ((ctx->count[1] += bitlen[1]) < bitlen[1]) - ctx->count[0]++; - ctx->count[0] += bitlen[0]; - - /* Handle the case where we don't need to perform any transforms */ - if (len < SHA512_BLOCK_LENGTH - r) { - memcpy(&ctx->buf[r], src, len); - return; - } - - /* Finish the current block */ - memcpy(&ctx->buf[r], src, SHA512_BLOCK_LENGTH - r); - SHA512_Transform(ctx->state, ctx->buf); - src += SHA512_BLOCK_LENGTH - r; - len -= SHA512_BLOCK_LENGTH - r; - - /* Perform complete blocks */ - while (len >= SHA512_BLOCK_LENGTH) { - SHA512_Transform(ctx->state, src); - src += SHA512_BLOCK_LENGTH; - len -= SHA512_BLOCK_LENGTH; - } - - /* Copy left over data into buffer */ - memcpy(ctx->buf, src, len); -} - -/* - * SHA-512 finalization. Pads the input data, exports the hash value, - * and clears the context state. - */ -void -SHA512_Final(unsigned char digest[static SHA512_DIGEST_LENGTH], SHA512_CTX *ctx) -{ - - /* Add padding */ - SHA512_Pad(ctx); - - /* Write the hash */ - be64enc_vect(digest, ctx->state, SHA512_DIGEST_LENGTH); - - /* Clear the context state */ - memset(ctx, 0, sizeof (*ctx)); -} - -/* SHA-512t: ******************************************************** */ -/* - * the SHA512t transforms are identical to SHA512 so reuse the existing function - */ -void -SHA512_224_Init(SHA512_CTX * ctx) -{ - - /* Zero bits processed so far */ - ctx->count[0] = ctx->count[1] = 0; - - /* Magic initialization constants */ - ctx->state[0] = 0x8c3d37c819544da2ULL; - ctx->state[1] = 0x73e1996689dcd4d6ULL; - ctx->state[2] = 0x1dfab7ae32ff9c82ULL; - ctx->state[3] = 0x679dd514582f9fcfULL; - ctx->state[4] = 0x0f6d2b697bd44da8ULL; - ctx->state[5] = 0x77e36f7304c48942ULL; - ctx->state[6] = 0x3f9d85a86a1d36c8ULL; - ctx->state[7] = 0x1112e6ad91d692a1ULL; -} - -void -SHA512_224_Update(SHA512_CTX * ctx, const void *in, size_t len) -{ - - SHA512_Update(ctx, in, len); -} - -void -SHA512_224_Final(unsigned char digest[static SHA512_224_DIGEST_LENGTH], - SHA512_CTX *ctx) -{ - - /* Add padding */ - SHA512_Pad(ctx); - - /* Write the hash */ - be64enc_vect(digest, ctx->state, SHA512_224_DIGEST_LENGTH); - - /* Clear the context state */ - memset(ctx, 0, sizeof (*ctx)); -} - -void -SHA512_256_Init(SHA512_CTX * ctx) -{ - - /* Zero bits processed so far */ - ctx->count[0] = ctx->count[1] = 0; - - /* Magic initialization constants */ - ctx->state[0] = 0x22312194fc2bf72cULL; - ctx->state[1] = 0x9f555fa3c84c64c2ULL; - ctx->state[2] = 0x2393b86b6f53b151ULL; - ctx->state[3] = 0x963877195940eabdULL; - ctx->state[4] = 0x96283ee2a88effe3ULL; - ctx->state[5] = 0xbe5e1e2553863992ULL; - ctx->state[6] = 0x2b0199fc2c85b8aaULL; - 
ctx->state[7] = 0x0eb72ddc81c52ca2ULL; -} - -void -SHA512_256_Update(SHA512_CTX * ctx, const void *in, size_t len) -{ - - SHA512_Update(ctx, in, len); -} - -void -SHA512_256_Final(unsigned char digest[static SHA512_256_DIGEST_LENGTH], - SHA512_CTX * ctx) -{ - - /* Add padding */ - SHA512_Pad(ctx); - - /* Write the hash */ - be64enc_vect(digest, ctx->state, SHA512_256_DIGEST_LENGTH); - - /* Clear the context state */ - memset(ctx, 0, sizeof (*ctx)); -} - -/* ** SHA-384: ******************************************************** */ -/* - * the SHA384 and SHA512 transforms are identical, so SHA384 is skipped - */ - -/* SHA-384 initialization. Begins a SHA-384 operation. */ -void -SHA384_Init(SHA384_CTX * ctx) -{ - - /* Zero bits processed so far */ - ctx->count[0] = ctx->count[1] = 0; - - /* Magic initialization constants */ - ctx->state[0] = 0xcbbb9d5dc1059ed8ULL; - ctx->state[1] = 0x629a292a367cd507ULL; - ctx->state[2] = 0x9159015a3070dd17ULL; - ctx->state[3] = 0x152fecd8f70e5939ULL; - ctx->state[4] = 0x67332667ffc00b31ULL; - ctx->state[5] = 0x8eb44a8768581511ULL; - ctx->state[6] = 0xdb0c2e0d64f98fa7ULL; - ctx->state[7] = 0x47b5481dbefa4fa4ULL; -} - -/* Add bytes into the SHA-384 hash */ -void -SHA384_Update(SHA384_CTX * ctx, const void *in, size_t len) -{ - - SHA512_Update((SHA512_CTX *)ctx, in, len); -} - -/* - * SHA-384 finalization. Pads the input data, exports the hash value, - * and clears the context state. - */ -void -SHA384_Final(unsigned char digest[static SHA384_DIGEST_LENGTH], SHA384_CTX *ctx) -{ - - /* Add padding */ - SHA512_Pad((SHA512_CTX *)ctx); - - /* Write the hash */ - be64enc_vect(digest, ctx->state, SHA384_DIGEST_LENGTH); - - /* Clear the context state */ - memset(ctx, 0, sizeof (*ctx)); -} - -#if 0 -/* - * When building libmd, provide weak references. Note: this is not - * activated in the context of compiling these sources for internal - * use in libcrypt. - */ -#undef SHA512_Init -__weak_reference(_libmd_SHA512_Init, SHA512_Init); -#undef SHA512_Update -__weak_reference(_libmd_SHA512_Update, SHA512_Update); -#undef SHA512_Final -__weak_reference(_libmd_SHA512_Final, SHA512_Final); -#undef SHA512_Transform -__weak_reference(_libmd_SHA512_Transform, SHA512_Transform); - -#undef SHA512_224_Init -__weak_reference(_libmd_SHA512_224_Init, SHA512_224_Init); -#undef SHA512_224_Update -__weak_reference(_libmd_SHA512_224_Update, SHA512_224_Update); -#undef SHA512_224_Final -__weak_reference(_libmd_SHA512_224_Final, SHA512_224_Final); - -#undef SHA512_256_Init -__weak_reference(_libmd_SHA512_256_Init, SHA512_256_Init); -#undef SHA512_256_Update -__weak_reference(_libmd_SHA512_256_Update, SHA512_256_Update); -#undef SHA512_256_Final -__weak_reference(_libmd_SHA512_256_Final, SHA512_256_Final); - -#undef SHA384_Init -__weak_reference(_libmd_SHA384_Init, SHA384_Init); -#undef SHA384_Update -__weak_reference(_libmd_SHA384_Update, SHA384_Update); -#undef SHA384_Final -__weak_reference(_libmd_SHA384_Final, SHA384_Final); -#endif diff --git a/module/os/freebsd/spl/sha512t.h b/module/os/freebsd/spl/sha512t.h deleted file mode 100644 index 703867fc0288..000000000000 --- a/module/os/freebsd/spl/sha512t.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2015 Allan Jude - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
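/*
 * Editor's sketch, not part of the patch: SHA-384 and the SHA-512/t variants
 * above reuse the SHA-512 context and compression function unchanged; they
 * differ only in the initial state words and in how many bytes of the final
 * state are emitted (48 for SHA-384, 32 for SHA-512/256, 28 for SHA-512/224).
 * Illustrative use of the SHA-512/256 entry points removed here:
 */
static void
sha512_256_digest_sketch(const void *buf, size_t len,
    unsigned char digest[SHA512_256_DIGEST_LENGTH])
{
	SHA512_CTX ctx;

	SHA512_256_Init(&ctx);			/* SHA-512/256 initial state */
	SHA512_256_Update(&ctx, buf, len);	/* same path as SHA512_Update() */
	SHA512_256_Final(digest, &ctx);		/* truncates the state to 32 bytes */
}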
- * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _SHA512T_H_ -#define _SHA512T_H_ - -#include "sha512.h" - -#ifndef _KERNEL -#include -#endif - -#define SHA512_224_DIGEST_LENGTH 28 -#define SHA512_224_DIGEST_STRING_LENGTH (SHA512_224_DIGEST_LENGTH * 2 + 1) -#define SHA512_256_DIGEST_LENGTH 32 -#define SHA512_256_DIGEST_STRING_LENGTH (SHA512_256_DIGEST_LENGTH * 2 + 1) - -__BEGIN_DECLS - -/* Ensure libmd symbols do not clash with libcrypto */ -#ifndef SHA512_224_Init -#define SHA512_224_Init _libmd_SHA512_224_Init -#endif -#ifndef SHA512_224_Update -#define SHA512_224_Update _libmd_SHA512_224_Update -#endif -#ifndef SHA512_224_Final -#define SHA512_224_Final _libmd_SHA512_224_Final -#endif -#ifndef SHA512_224_End -#define SHA512_224_End _libmd_SHA512_224_End -#endif -#ifndef SHA512_224_Fd -#define SHA512_224_Fd _libmd_SHA512_224_Fd -#endif -#ifndef SHA512_224_FdChunk -#define SHA512_224_FdChunk _libmd_SHA512_224_FdChunk -#endif -#ifndef SHA512_224_File -#define SHA512_224_File _libmd_SHA512_224_File -#endif -#ifndef SHA512_224_FileChunk -#define SHA512_224_FileChunk _libmd_SHA512_224_FileChunk -#endif -#ifndef SHA512_224_Data -#define SHA512_224_Data _libmd_SHA512_224_Data -#endif - -#ifndef SHA512_224_Transform -#define SHA512_224_Transform _libmd_SHA512_224_Transform -#endif -#ifndef SHA512_224_version -#define SHA512_224_version _libmd_SHA512_224_version -#endif - -#ifndef SHA512_256_Init -#define SHA512_256_Init _libmd_SHA512_256_Init -#endif -#ifndef SHA512_256_Update -#define SHA512_256_Update _libmd_SHA512_256_Update -#endif -#ifndef SHA512_256_Final -#define SHA512_256_Final _libmd_SHA512_256_Final -#endif -#ifndef SHA512_256_End -#define SHA512_256_End _libmd_SHA512_256_End -#endif -#ifndef SHA512_256_Fd -#define SHA512_256_Fd _libmd_SHA512_256_Fd -#endif -#ifndef SHA512_256_FdChunk -#define SHA512_256_FdChunk _libmd_SHA512_256_FdChunk -#endif -#ifndef SHA512_256_File -#define SHA512_256_File _libmd_SHA512_256_File -#endif -#ifndef SHA512_256_FileChunk -#define SHA512_256_FileChunk _libmd_SHA512_256_FileChunk -#endif -#ifndef SHA512_256_Data -#define SHA512_256_Data _libmd_SHA512_256_Data -#endif - -#ifndef SHA512_256_Transform -#define SHA512_256_Transform _libmd_SHA512_256_Transform -#endif -#ifndef SHA512_256_version -#define SHA512_256_version _libmd_SHA512_256_version -#endif - -void SHA512_224_Init(SHA512_CTX *); -void SHA512_224_Update(SHA512_CTX *, const void *, size_t); -void SHA512_224_Final(unsigned char [__min_size(SHA512_224_DIGEST_LENGTH)], - SHA512_CTX *); -#ifndef _KERNEL -char 
*SHA512_224_End(SHA512_CTX *, char *); -char *SHA512_224_Data(const void *, unsigned int, char *); -char *SHA512_224_Fd(int, char *); -char *SHA512_224_FdChunk(int, char *, off_t, off_t); -char *SHA512_224_File(const char *, char *); -char *SHA512_224_FileChunk(const char *, char *, off_t, off_t); -#endif -void SHA512_256_Init(SHA512_CTX *); -void SHA512_256_Update(SHA512_CTX *, const void *, size_t); -void SHA512_256_Final(unsigned char [__min_size(SHA512_256_DIGEST_LENGTH)], - SHA512_CTX *); -#ifndef _KERNEL -char *SHA512_256_End(SHA512_CTX *, char *); -char *SHA512_256_Data(const void *, unsigned int, char *); -char *SHA512_256_Fd(int, char *); -char *SHA512_256_FdChunk(int, char *, off_t, off_t); -char *SHA512_256_File(const char *, char *); -char *SHA512_256_FileChunk(const char *, char *, off_t, off_t); -#endif - -__END_DECLS - -#endif /* !_SHA512T_H_ */ diff --git a/module/zfs/sha256.c b/module/zfs/sha2_zfs.c similarity index 90% rename from module/zfs/sha256.c rename to module/zfs/sha2_zfs.c index 445d82ed020c..872b1e53ee66 100644 --- a/module/zfs/sha256.c +++ b/module/zfs/sha2_zfs.c @@ -18,16 +18,14 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. */ + #include -#include #include #include #include @@ -42,7 +40,7 @@ sha_incremental(void *buf, size_t size, void *arg) } void -abd_checksum_SHA256(abd_t *abd, uint64_t size, +abd_checksum_sha256(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; @@ -79,7 +77,7 @@ abd_checksum_SHA256(abd_t *abd, uint64_t size, } void -abd_checksum_SHA512_native(abd_t *abd, uint64_t size, +abd_checksum_sha512_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; @@ -91,12 +89,12 @@ abd_checksum_SHA512_native(abd_t *abd, uint64_t size, } void -abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, +abd_checksum_sha512_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); + abd_checksum_sha512_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c index 91247f29278f..acedeab7a163 100644 --- a/module/zfs/zfs_chksum.c +++ b/module/zfs/zfs_chksum.c @@ -23,13 +23,13 @@ * Copyright (c) 2021-2022 Tino Reichardt */ -#include -#include #include #include #include +#include #include +#include /* limit benchmarking to max 256KiB, when EdonR is slower then this: */ #define LIMIT_PERF_MBS 300 @@ -56,25 +56,26 @@ static int chksum_stat_cnt = 0; static kstat_t *chksum_kstat = NULL; /* - * i3-1005G1 test output: + * Sample output on i3-1005G1 System: * - * implementation 1k 4k 16k 64k 256k 1m 4m - * fletcher-4 5421 15001 26468 32555 34720 32801 18847 - * edonr-generic 1196 1602 1761 1749 1762 1759 1751 - * skein-generic 546 591 608 615 619 612 616 - * sha256-generic 246 270 274 274 277 275 276 - * sha256-avx 262 296 304 307 307 307 306 - * sha256-sha-ni 769 1072 1172 1220 1219 1232 1228 - * sha256-openssl 240 300 316 314 304 285 276 - * sha512-generic 333 374 385 392 391 393 392 - * sha512-openssl 353 441 467 476 472 467 426 - * sha512-avx 362 444 473 475 479 476 478 - * sha512-avx2 394 500 530 538 543 545 542 - 
* blake3-generic 308 313 313 313 312 313 312 - * blake3-sse2 402 1289 1423 1446 1432 1458 1413 - * blake3-sse41 427 1470 1625 1704 1679 1607 1629 - * blake3-avx2 428 1920 3095 3343 3356 3318 3204 - * blake3-avx512 473 2687 4905 5836 5844 5643 5374 + * implementation 1k 4k 16k 64k 256k 1m 4m 16m + * edonr-generic 1278 1625 1769 1776 1783 1778 1771 1767 + * skein-generic 548 594 613 623 621 623 621 486 + * sha256-generic 255 270 281 278 279 281 283 283 + * sha256-x64 288 310 316 317 318 317 317 316 + * sha256-ssse3 304 342 351 355 356 357 356 356 + * sha256-avx 311 348 359 362 362 363 363 362 + * sha256-avx2 330 378 389 395 395 395 395 395 + * sha256-shani 908 1127 1212 1230 1233 1234 1223 1230 + * sha512-generic 359 409 431 427 429 430 428 423 + * sha512-x64 420 473 490 496 497 497 496 495 + * sha512-avx 406 522 546 560 560 560 556 560 + * sha512-avx2 464 568 601 606 609 610 607 608 + * blake3-generic 330 327 324 323 324 320 323 322 + * blake3-sse2 424 1366 1449 1468 1458 1453 1395 1408 + * blake3-sse41 453 1554 1658 1703 1689 1669 1622 1630 + * blake3-avx2 452 2013 3225 3351 3356 3261 3076 3101 + * blake3-avx512 498 2869 5269 5926 5872 5643 5014 5005 */ static int chksum_kstat_headers(char *buf, size_t size) @@ -237,25 +238,30 @@ chksum_benchit(chksum_stat_t *cs) static void chksum_benchmark(void) { - #ifndef _KERNEL /* we need the benchmark only for the kernel module */ return; #endif chksum_stat_t *cs; - int cbid = 0; - uint64_t max = 0; - uint32_t id, id_save; - - /* space for the benchmark times */ - chksum_stat_cnt = 4; - chksum_stat_cnt += blake3_impl_getcnt(); + uint64_t max; + uint32_t id, cbid = 0, id_save; + const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); + const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); + const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); + + /* count implementations */ + chksum_stat_cnt = 2; + chksum_stat_cnt += sha256->getcnt(); + chksum_stat_cnt += sha512->getcnt(); + chksum_stat_cnt += blake3->getcnt(); chksum_stat_data = kmem_zalloc( sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); /* edonr - needs to be the first one here (slow CPU check) */ cs = &chksum_stat_data[cbid++]; + + /* edonr */ cs->init = abd_checksum_edonr_tmpl_init; cs->func = abd_checksum_edonr_native; cs->free = abd_checksum_edonr_tmpl_free; @@ -273,42 +279,58 @@ chksum_benchmark(void) chksum_benchit(cs); /* sha256 */ - cs = &chksum_stat_data[cbid++]; - cs->init = 0; - cs->func = abd_checksum_SHA256; - cs->free = 0; - cs->name = "sha256"; - cs->impl = "generic"; - chksum_benchit(cs); + id_save = sha256->getid(); + for (max = 0, id = 0; id < sha256->getcnt(); id++) { + sha256->setid(id); + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_sha256; + cs->free = 0; + cs->name = sha256->name; + cs->impl = sha256->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + sha256->set_fastest(id); + } + } + sha256->setid(id_save); /* sha512 */ - cs = &chksum_stat_data[cbid++]; - cs->init = 0; - cs->func = abd_checksum_SHA512_native; - cs->free = 0; - cs->name = "sha512"; - cs->impl = "generic"; - chksum_benchit(cs); + id_save = sha512->getid(); + for (max = 0, id = 0; id < sha512->getcnt(); id++) { + sha512->setid(id); + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_sha512_native; + cs->free = 0; + cs->name = sha512->name; + cs->impl = sha512->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + sha512->set_fastest(id); + } + } + sha512->setid(id_save); /* blake3 */ - id_save 
= blake3_impl_getid(); - for (id = 0; id < blake3_impl_getcnt(); id++) { - blake3_impl_setid(id); + id_save = blake3->getid(); + for (max = 0, id = 0; id < blake3->getcnt(); id++) { + blake3->setid(id); cs = &chksum_stat_data[cbid++]; cs->init = abd_checksum_blake3_tmpl_init; cs->func = abd_checksum_blake3_native; cs->free = abd_checksum_blake3_tmpl_free; - cs->name = "blake3"; - cs->impl = blake3_impl_getname(); + cs->name = blake3->name; + cs->impl = blake3->getname(); chksum_benchit(cs); if (cs->bs256k > max) { max = cs->bs256k; - blake3_impl_set_fastest(id); + blake3->set_fastest(id); } } - - /* restore initial value */ - blake3_impl_setid(id_save); + blake3->setid(id_save); } void diff --git a/module/zfs/zfs_impl.c b/module/zfs/zfs_impl.c new file mode 100644 index 000000000000..20322ff98b31 --- /dev/null +++ b/module/zfs/zfs_impl.c @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 Tino Reichardt + */ + +#include +#include +#include + +#include +#include + +/* + * impl_ops - backend for implementations of algorithms + */ +const zfs_impl_t *impl_ops[] = { + &zfs_blake3_ops, + &zfs_sha256_ops, + &zfs_sha512_ops, + NULL +}; + +/* + * zfs_impl_get_ops - Get the API functions for an impl backend + */ +const zfs_impl_t * +zfs_impl_get_ops(const char *algo) +{ + const zfs_impl_t **ops = impl_ops; + + if (!algo || !*algo) + return (*ops); + + for (; *ops; ops++) { + if (strcmp(algo, (*ops)->name) == 0) + break; + } + + ASSERT3P(ops, !=, NULL); + return (*ops); +} diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 3743eaa532ef..6090959c5b8c 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -165,10 +165,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "on"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "off"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, + {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, + {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, @@ -177,14 +177,14 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { NULL, NULL, 0, "fletcher2"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, + {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, 
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index 3743eaa532ef..6090959c5b8c 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -165,10 +165,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
 	{{NULL, NULL}, NULL, NULL, 0, "on"},
 	{{abd_checksum_off, abd_checksum_off},
 	    NULL, NULL, 0, "off"},
-	{{abd_checksum_SHA256, abd_checksum_SHA256},
+	{{abd_checksum_sha256, abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "label"},
-	{{abd_checksum_SHA256, abd_checksum_SHA256},
+	{{abd_checksum_sha256, abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "gang_header"},
 	{{abd_fletcher_2_native, abd_fletcher_2_byteswap},
@@ -177,14 +177,14 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
 	    NULL, NULL, 0, "fletcher2"},
 	{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
-	{{abd_checksum_SHA256, abd_checksum_SHA256},
+	{{abd_checksum_sha256, abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
 	{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
 	{{abd_checksum_off, abd_checksum_off},
 	    NULL, NULL, 0, "noparity"},
-	{{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
+	{{abd_checksum_sha512_native, abd_checksum_sha512_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
 	{{abd_checksum_skein_native, abd_checksum_skein_byteswap},
diff --git a/tests/zfs-tests/cmd/checksum/blake3_test.c b/tests/zfs-tests/cmd/checksum/blake3_test.c
index 648e1faaaeb7..aebe0363cc6e 100644
--- a/tests/zfs-tests/cmd/checksum/blake3_test.c
+++ b/tests/zfs-tests/cmd/checksum/blake3_test.c
@@ -31,6 +31,8 @@
 #include
 #include
+#include
+
 /*
  * set it to a define for debugging
  */
@@ -485,10 +487,14 @@ main(int argc, char *argv[])
 	uint8_t buffer[102400];
 	uint64_t cpu_mhz = 0;
 	int id, i, j;
+	const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3");
 
 	if (argc == 2)
 		cpu_mhz = atoi(argv[1]);
 
+	if (!blake3)
+		return (1);
+
 	/* fill test message */
 	for (i = 0, j = 0; i < sizeof (buffer); i++, j++) {
 		if (j == 251)
@@ -497,9 +503,9 @@ main(int argc, char *argv[])
 	}
 
 	(void) printf("Running algorithm correctness tests:\n");
-	for (id = 0; id < blake3_impl_getcnt(); id++) {
-		blake3_impl_setid(id);
-		const char *name = blake3_impl_getname();
+	for (id = 0; id < blake3->getcnt(); id++) {
+		blake3->setid(id);
+		const char *name = blake3->getname();
 		dprintf("Result for BLAKE3-%s:\n", name);
 		for (i = 0; TestArray[i].hash; i++) {
 			blake3_test_t *cur = &TestArray[i];
@@ -565,9 +571,9 @@ main(int argc, char *argv[])
 	} while (0)
 
 	printf("Running performance tests (hashing 1024 MiB of data):\n");
-	for (id = 0; id < blake3_impl_getcnt(); id++) {
-		blake3_impl_setid(id);
-		const char *name = blake3_impl_getname();
+	for (id = 0; id < blake3->getcnt(); id++) {
+		blake3->setid(id);
+		const char *name = blake3->getname();
 		BLAKE3_PERF_TEST(name, 256);
 	}
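blake3_test.c now runs both the known-answer vectors and the performance test once per registered implementation by stepping setid() through getcnt() entries and labelling output with getname(). A self-contained sketch of that pattern follows; for_each_impl() and the verify callback are illustrative names, not part of the patch, and the <sys/zfs_impl.h> include is assumed to provide zfs_impl_get_ops() as added by this series.

#include <stdint.h>
#include <stdio.h>
#include <sys/zfs_impl.h>       /* assumed: zfs_impl_t, zfs_impl_get_ops() */

/*
 * Hedged sketch (not in the patch): run "verify" once per registered
 * implementation of "algo" and restore the previously selected id.
 */
static int
for_each_impl(const char *algo, int (*verify)(const char *impl_name))
{
        const zfs_impl_t *ops = zfs_impl_get_ops(algo);
        uint32_t id, id_save;
        int rc = 0;

        if (ops == NULL)
                return (1);

        id_save = ops->getid();
        for (id = 0; id < ops->getcnt(); id++) {
                ops->setid(id);
                if (verify(ops->getname()) != 0) {
                        (void) fprintf(stderr, "%s-%s: FAILED\n",
                            ops->name, ops->getname());
                        rc = 1;
                }
        }
        ops->setid(id_save);
        return (rc);
}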
256, name); + } + + for (id = 0; id < sha512->getcnt(); id++) { + sha512->setid(id); + const char *name = sha512->getname(); + SHA2_PERF_TEST(512, 512, name); + } return (0); }