From b5e34b7dc62eaa9088cee9f5f9d5808a1784ec11 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra
Date: Mon, 14 Aug 2017 15:48:22 -0500
Subject: [PATCH] arm64: lib: use optimized memcmp

Patch written by Wilco Dijkstra, submitted for review to newlib:
https://sourceware.org/ml/newlib/2017/msg00524.html

This is an optimized memcmp for AArch64.  This is a complete rewrite
using a different algorithm.  The previous version split into cases
where both inputs were aligned, where the inputs were mutually aligned,
and where they were unaligned, using a byte loop.  The new version
combines all these cases, while small inputs of less than 8 bytes are
handled separately.

This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared.  After the first 8
bytes, align the first input.  This ensures each iteration does at
most one unaligned access and mutually aligned inputs behave as
aligned.  After the main loop, process the last 8 bytes using
unaligned accesses.

This improves performance of (mutually) aligned cases by 25% and
unaligned by >500% (yes, >6 times faster) on large inputs.
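For illustration only (not part of the patch): a minimal C sketch of
the algorithm described above.  The names memcmp_sketch, load64 and
diff64 are made up for this note; the sketch assumes a little-endian
target where unaligned 64-bit loads are permitted (expressed portably
via memcpy) and a compiler providing __builtin_bswap64 (GCC/Clang).

#include <stdint.h>
#include <string.h>

/* Unaligned 64-bit load, expressed portably.  */
static uint64_t load64(const unsigned char *p)
{
	uint64_t v;
	memcpy(&v, p, sizeof(v));
	return v;
}

/* Order two unequal words the way memcmp orders the underlying bytes:
   byte-swapping makes the byte at the lowest address most significant
   (little-endian assumed).  */
static int diff64(uint64_t a, uint64_t b)
{
	a = __builtin_bswap64(a);
	b = __builtin_bswap64(b);
	return a < b ? -1 : 1;
}

int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p1 = s1, *p2 = s2;
	uint64_t d1, d2;
	size_t skew, rem;

	/* Small inputs of less than 8 bytes are handled separately.  */
	if (n < 8) {
		while (n--) {
			int d = *p1++ - *p2++;
			if (d)
				return d;
		}
		return 0;
	}

	/* At least 8 bytes: compare the first 8 with unaligned loads.  */
	d1 = load64(p1);
	d2 = load64(p2);
	if (d1 != d2)
		return diff64(d1, d2);

	/* Align p1 by backing both pointers up from p1 + 8; the bytes
	   compared again are already known equal, so only the remaining
	   count needs adjusting.  */
	skew = ((uintptr_t)p1 + 8) & 7;
	p1 += 8 - skew;
	p2 += 8 - skew;
	rem = n - (8 - skew);

	/* Main loop: 8 bytes per iteration, at most one unaligned load.  */
	while (rem > 8) {
		d1 = load64(p1);
		d2 = load64(p2);
		if (d1 != d2)
			return diff64(d1, d2);
		p1 += 8;
		p2 += 8;
		rem -= 8;
	}

	/* Last 1-8 bytes: one overlapping unaligned load from the end;
	   any bytes re-read here are already known equal.  */
	d1 = load64(p1 + rem - 8);
	d2 = load64(p2 + rem - 8);
	return d1 == d2 ? 0 : diff64(d1, d2);
}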
2017-06-28  Wilco Dijkstra

	* bionic/libc/arch-arm64/generic/bionic/memcmp.S (memcmp):
	Rewrite of optimized memcmp.

GLIBC benchtests/bench-memcmp.c performance comparison for Cortex-A53:

Length    1, alignment  1/ 1: 153%
Length    1, alignment  1/ 1: 119%
Length    1, alignment  1/ 1: 154%
Length    2, alignment  2/ 2: 121%
Length    2, alignment  2/ 2: 140%
Length    2, alignment  2/ 2: 121%
Length    3, alignment  3/ 3: 105%
Length    3, alignment  3/ 3: 105%
Length    3, alignment  3/ 3: 105%
Length    4, alignment  4/ 4: 155%
Length    4, alignment  4/ 4: 154%
Length    4, alignment  4/ 4: 161%
Length    5, alignment  5/ 5: 173%
Length    5, alignment  5/ 5: 173%
Length    5, alignment  5/ 5: 173%
Length    6, alignment  6/ 6: 145%
Length    6, alignment  6/ 6: 145%
Length    6, alignment  6/ 6: 145%
Length    7, alignment  7/ 7: 125%
Length    7, alignment  7/ 7: 125%
Length    7, alignment  7/ 7: 125%
Length    8, alignment  8/ 8: 111%
Length    8, alignment  8/ 8: 130%
Length    8, alignment  8/ 8: 124%
Length    9, alignment  9/ 9: 160%
Length    9, alignment  9/ 9: 160%
Length    9, alignment  9/ 9: 150%
Length   10, alignment 10/10: 170%
Length   10, alignment 10/10: 137%
Length   10, alignment 10/10: 150%
Length   11, alignment 11/11: 160%
Length   11, alignment 11/11: 160%
Length   11, alignment 11/11: 160%
Length   12, alignment 12/12: 146%
Length   12, alignment 12/12: 168%
Length   12, alignment 12/12: 156%
Length   13, alignment 13/13: 167%
Length   13, alignment 13/13: 167%
Length   13, alignment 13/13: 173%
Length   14, alignment 14/14: 167%
Length   14, alignment 14/14: 168%
Length   14, alignment 14/14: 168%
Length   15, alignment 15/15: 168%
Length   15, alignment 15/15: 173%
Length   15, alignment 15/15: 173%
Length    1, alignment  0/ 0: 134%
Length    1, alignment  0/ 0: 127%
Length    1, alignment  0/ 0: 119%
Length    2, alignment  0/ 0:  94%
Length    2, alignment  0/ 0:  94%
Length    2, alignment  0/ 0: 106%
Length    3, alignment  0/ 0:  82%
Length    3, alignment  0/ 0:  87%
Length    3, alignment  0/ 0:  82%
Length    4, alignment  0/ 0: 115%
Length    4, alignment  0/ 0: 115%
Length    4, alignment  0/ 0: 122%
Length    5, alignment  0/ 0: 127%
Length    5, alignment  0/ 0: 119%
Length    5, alignment  0/ 0: 127%
Length    6, alignment  0/ 0: 103%
Length    6, alignment  0/ 0: 100%
Length    6, alignment  0/ 0: 100%
Length    7, alignment  0/ 0:  82%
Length    7, alignment  0/ 0:  91%
Length    7, alignment  0/ 0:  87%
Length    8, alignment  0/ 0: 111%
Length    8, alignment  0/ 0: 124%
Length    8, alignment  0/ 0: 124%
Length    9, alignment  0/ 0: 136%
Length    9, alignment  0/ 0: 136%
Length    9, alignment  0/ 0: 136%
Length   10, alignment  0/ 0: 136%
Length   10, alignment  0/ 0: 135%
Length   10, alignment  0/ 0: 136%
Length   11, alignment  0/ 0: 136%
Length   11, alignment  0/ 0: 136%
Length   11, alignment  0/ 0: 135%
Length   12, alignment  0/ 0: 136%
Length   12, alignment  0/ 0: 136%
Length   12, alignment  0/ 0: 136%
Length   13, alignment  0/ 0: 135%
Length   13, alignment  0/ 0: 136%
Length   13, alignment  0/ 0: 136%
Length   14, alignment  0/ 0: 136%
Length   14, alignment  0/ 0: 136%
Length   14, alignment  0/ 0: 136%
Length   15, alignment  0/ 0: 136%
Length   15, alignment  0/ 0: 136%
Length   15, alignment  0/ 0: 136%
Length    4, alignment  0/ 0: 115%
Length    4, alignment  0/ 0: 115%
Length    4, alignment  0/ 0: 115%
Length   32, alignment  0/ 0: 127%
Length   32, alignment  7/ 2: 395%
Length   32, alignment  0/ 0: 127%
Length   32, alignment  0/ 0: 127%
Length    8, alignment  0/ 0: 111%
Length    8, alignment  0/ 0: 124%
Length    8, alignment  0/ 0: 124%
Length   64, alignment  0/ 0: 128%
Length   64, alignment  6/ 4: 475%
Length   64, alignment  0/ 0: 131%
Length   64, alignment  0/ 0: 134%
Length   16, alignment  0/ 0: 128%
Length   16, alignment  0/ 0: 119%
Length   16, alignment  0/ 0: 128%
Length  128, alignment  0/ 0: 129%
Length  128, alignment  5/ 6: 475%
Length  128, alignment  0/ 0: 130%
Length  128, alignment  0/ 0: 129%
Length   32, alignment  0/ 0: 126%
Length   32, alignment  0/ 0: 126%
Length   32, alignment  0/ 0: 126%
Length  256, alignment  0/ 0: 127%
Length  256, alignment  4/ 8: 545%
Length  256, alignment  0/ 0: 126%
Length  256, alignment  0/ 0: 128%
Length   64, alignment  0/ 0: 171%
Length   64, alignment  0/ 0: 171%
Length   64, alignment  0/ 0: 174%
Length  512, alignment  0/ 0: 126%
Length  512, alignment  3/10: 585%
Length  512, alignment  0/ 0: 126%
Length  512, alignment  0/ 0: 127%
Length  128, alignment  0/ 0: 129%
Length  128, alignment  0/ 0: 128%
Length  128, alignment  0/ 0: 129%
Length 1024, alignment  0/ 0: 125%
Length 1024, alignment  2/12: 611%
Length 1024, alignment  0/ 0: 126%
Length 1024, alignment  0/ 0: 126%
Length  256, alignment  0/ 0: 128%
Length  256, alignment  0/ 0: 127%
Length  256, alignment  0/ 0: 128%
Length 2048, alignment  0/ 0: 125%
Length 2048, alignment  1/14: 625%
Length 2048, alignment  0/ 0: 125%
Length 2048, alignment  0/ 0: 125%
Length  512, alignment  0/ 0: 126%
Length  512, alignment  0/ 0: 127%
Length  512, alignment  0/ 0: 127%
Length 4096, alignment  0/ 0: 125%
Length 4096, alignment  0/16: 125%
Length 4096, alignment  0/ 0: 125%
Length 4096, alignment  0/ 0: 125%
Length 1024, alignment  0/ 0: 126%
Length 1024, alignment  0/ 0: 126%
Length 1024, alignment  0/ 0: 126%
Length 8192, alignment  0/ 0: 125%
Length 8192, alignment 63/18: 636%
Length 8192, alignment  0/ 0: 125%
Length 8192, alignment  0/ 0: 125%
Length   16, alignment  1/ 2: 317%
Length   16, alignment  1/ 2: 317%
Length   16, alignment  1/ 2: 317%
Length   32, alignment  2/ 4: 395%
Length   32, alignment  2/ 4: 395%
Length   32, alignment  2/ 4: 398%
Length   64, alignment  3/ 6: 475%
Length   64, alignment  3/ 6: 475%
Length   64, alignment  3/ 6: 477%
Length  128, alignment  4/ 8: 479%
Length  128, alignment  4/ 8: 479%
Length  128, alignment  4/ 8: 479%
Length  256, alignment  5/10: 543%
Length  256, alignment  5/10: 539%
Length  256, alignment  5/10: 543%
Length  512, alignment  6/12: 585%
Length  512, alignment  6/12: 585%
Length  512, alignment  6/12: 585%
Length 1024, alignment  7/14: 611%
Length 1024, alignment  7/14: 611%
Length 1024, alignment  7/14: 611%

Signed-off-by: Francisco Franco
Signed-off-by: kdrag0n
Signed-off-by: Raphiel Rollerscaperers
Signed-off-by: laststandrighthere
Signed-off-by: Lau
Signed-off-by: Azuki
---
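Not part of the patch: a quick host-side harness one could use to check
an optimized memcmp against the C library's over the same length and
alignment grid the table above samples.  memcmp_sketch() refers to the
hypothetical C sketch earlier in this message; only the sign of the
return value is compared, since that is all memcmp specifies.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int memcmp_sketch(const void *s1, const void *s2, size_t n);

/* Reduce a memcmp result to its sign.  */
static int sign(int x)
{
	return (x > 0) - (x < 0);
}

int main(void)
{
	unsigned char buf1[128], buf2[128];
	size_t len, a1, a2, flip, i;

	for (len = 0; len <= 64; len++)
	for (a1 = 0; a1 < 8; a1++)
	for (a2 = 0; a2 < 8; a2++)
	for (flip = 0; flip <= len; flip++) {
		unsigned char *s1 = buf1 + a1, *s2 = buf2 + a2;

		/* Fill both buffers identically, then force one
		   mismatching byte (flip == len leaves them equal).  */
		for (i = 0; i < len; i++)
			s1[i] = s2[i] = (unsigned char)rand();
		if (flip < len)
			s2[flip] ^= 0x80;

		if (sign(memcmp_sketch(s1, s2, len)) !=
		    sign(memcmp(s1, s2, len))) {
			printf("FAIL: len=%zu align=%zu/%zu flip=%zu\n",
			       len, a1, a2, flip);
			return 1;
		}
	}
	printf("all length/alignment cases agree\n");
	return 0;
}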
 arch/arm64/lib/memcmp.S | 357 +++++++++++++---------------------------
 1 file changed, 115 insertions(+), 242 deletions(-)

diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index 2a4e239bd17a..b58c300ab5d9 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -1,258 +1,131 @@
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2017 ARM Ltd
+ * All rights reserved.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * and re-licensed under GPLv2 for the Linux kernel. The original code can
- * be found @
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
  *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
  *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * ARMv8-a, AArch64, unaligned accesses.
  */

+/* includes here */
 #include <linux/linkage.h>
 #include <asm/assembler.h>

-/*
-* compare memory areas(when two memory areas' offset are different,
-* alignment handled by the hardware)
-*
-* Parameters:
-*  x0 - const memory area 1 pointer
-*  x1 - const memory area 2 pointer
-*  x2 - the maximal compare byte length
-* Returns:
-*  x0 - a compare result, maybe less than, equal to, or greater than ZERO
-*/
-
 /* Parameters and result.  */
-src1		.req	x0
-src2		.req	x1
-limit		.req	x2
-result		.req	x0
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		w0

 /* Internal variables.  */
-data1		.req	x3
-data1w		.req	w3
-data2		.req	x4
-data2w		.req	w4
-has_nul		.req	x5
-diff		.req	x6
-endloop		.req	x7
-tmp1		.req	x8
-tmp2		.req	x9
-tmp3		.req	x10
-pos		.req	x11
-limit_wd	.req	x12
-mask		.req	x13
-
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define data2w		w4
+#define tmp1		x5
+
+/* Small inputs of less than 8 bytes are handled separately.  This allows the
+   main code to be sped up using unaligned loads since there are now at least
+   8 bytes to be compared.  If the first 8 bytes are equal, align src1.
+   This ensures each iteration does at most one unaligned access even if both
+   src1 and src2 are unaligned, and mutually aligned inputs behave as if
+   aligned.  After the main loop, process the last 8 bytes using unaligned
+   accesses.  */
+
+.p2align 6
 ENTRY(memcmp)
-	cbz	limit, .Lret0
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	.Lmisaligned8
-	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
-	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
-	lsr	limit_wd, limit_wd, #3 /* Convert to Dwords.  */
-	/*
-	 * The input source addresses are at alignment boundary.
-	 * Directly compare eight bytes each time.
-	 */
-.Lloop_aligned:
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-.Lstart_realigned:
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, cs	/* Last Dword or differences.  */
-	cbz	endloop, .Lloop_aligned
-
-	/* Not reached the limit, must have found a diff.  */
-	tbz	limit_wd, #63, .Lnot_limit
-
-	/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
-	ands	limit, limit, #7
-	b.eq	.Lnot_limit
-	/*
-	 * The remained bytes less than 8. It is needed to extract valid data
-	 * from last eight bytes of the intended memory range.
-	 */
-	lsl	limit, limit, #3	/* bytes-> bits.  */
-	mov	mask, #~0
-CPU_BE( lsr	mask, mask, limit )
-CPU_LE( lsl	mask, mask, limit )
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	orr	diff, diff, mask
-	b	.Lnot_limit
-
-.Lmutual_align:
-	/*
-	 * Sources are mutually aligned, but are not currently at an
-	 * alignment boundary. Round down the addresses and then mask off
-	 * the bytes that precede the start point.
-	 */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	/*
-	 * We can not add limit with alignment offset(tmp1) here. Since the
-	 * addition probably make the limit overflown.
-	 */
-	sub	limit_wd, limit, #1/*limit != 0, so no underflow.*/
-	and	tmp3, limit_wd, #7
-	lsr	limit_wd, limit_wd, #3
-	add	tmp3, tmp3, tmp1
-	add	limit_wd, limit_wd, tmp3, lsr #3
-	add	limit, limit, tmp1/* Adjust the limit for the extra.  */
-
-	lsl	tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
-	neg	tmp1, tmp1/* Bits to alignment -64.  */
-	mov	tmp2, #~0
-	/*mask off the non-intended bytes before the start address.*/
-CPU_BE( lsl	tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
-	/* Little-endian.  Early bytes are at LSB.  */
-CPU_LE( lsr	tmp2, tmp2, tmp1 )
-
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	b	.Lstart_realigned
-
-	/*src1 and src2 have different alignment offset.*/
-.Lmisaligned8:
-	cmp	limit, #8
-	b.lo	.Ltiny8proc /*limit < 8: compare byte by byte*/
-
-	and	tmp1, src1, #7
-	neg	tmp1, tmp1
-	add	tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
-	and	tmp2, src2, #7
-	neg	tmp2, tmp2
-	add	tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
-	subs	tmp3, tmp1, tmp2
-	csel	pos, tmp1, tmp2, hi /*Choose the maximum.*/
-
-	sub	limit, limit, pos
-	/*compare the proceeding bytes in the first 8 byte segment.*/
-.Ltinycmp:
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	pos, pos, #1
-	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
-	b.eq	.Ltinycmp
-	cbnz	pos, 1f /*diff occurred before the last byte.*/
+	subs	limit, limit, 8
+	b.lo	.Lless8
+
+	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	and	tmp1, src1, 7
+	add	limit, limit, tmp1
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	subs	limit, limit, 8
+	b.ls	.Llast_bytes
+
+	/* Loop performing 8 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 8 and must be larger than zero.
+	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	.p2align 4
+.Lloop8:
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	subs	limit, limit, 8
+	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
+	b.eq	.Lloop8
+
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Compare last 1-8 bytes using unaligned access.  */
+.Llast_bytes:
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+.Lreturn:
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	cmp	data1, data2
+.Lret_eq:
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+.Lless8:
+	adds	limit, limit, 4
+	b.lo	.Lless4
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
 	cmp	data1w, data2w
-	b.eq	.Lstart_align
-1:
-	sub	result, data1, data2
-	ret
-
-.Lstart_align:
-	lsr	limit_wd, limit, #3
-	cbz	limit_wd, .Lremain8
-
-	ands	xzr, src1, #7
-	b.eq	.Lrecal_offset
-	/*process more leading bytes to make src1 aligned...*/
-	add	src1, src1, tmp3 /*backwards src1 to alignment boundary*/
-	add	src2, src2, tmp3
-	sub	limit, limit, tmp3
-	lsr	limit_wd, limit, #3
-	cbz	limit_wd, .Lremain8
-	/*load 8 bytes from aligned SRC1..*/
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2 /*Non-zero if differences found.*/
-	csinv	endloop, diff, xzr, ne
-	cbnz	endloop, .Lunequal_proc
-	/*How far is the current SRC2 from the alignment boundary...*/
-	and	tmp3, tmp3, #7
-
-.Lrecal_offset:/*src1 is aligned now..*/
-	neg	pos, tmp3
-.Lloopcmp_proc:
-	/*
-	 * Divide the eight bytes into two parts. First,backwards the src2
-	 * to an alignment boundary,load eight bytes and compare from
-	 * the SRC2 alignment boundary. If all 8 bytes are equal,then start
-	 * the second part's comparison. Otherwise finish the comparison.
-	 * This special handle can garantee all the accesses are in the
-	 * thread/task space in avoid to overrange access.
-	 */
-	ldr	data1, [src1,pos]
-	ldr	data2, [src2,pos]
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	cbnz	diff, .Lnot_limit
-
-	/*The second part process*/
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	subs	limit_wd, limit_wd, #1
-	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
-	cbz	endloop, .Lloopcmp_proc
-.Lunequal_proc:
-	cbz	diff, .Lremain8
-
-/* There is difference occurred in the latest comparison. */
-.Lnot_limit:
-/*
-* For little endian,reverse the low significant equal bits into MSB,then
-* following CLZ can find how many equal bits exist.
-*/
-CPU_LE( rev	diff, diff )
-CPU_LE( rev	data1, data1 )
-CPU_LE( rev	data2, data2 )
-
-	/*
-	 * The MS-non-zero bit of DIFF marks either the first bit
-	 * that is different, or the end of the significant data.
-	 * Shifting left now will bring the critical information into the
-	 * top bits.
-	 */
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/*
-	 * We need to zero-extend (char is unsigned) the value and then
-	 * perform a signed subtraction.
-	 */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	ret
-
-.Lremain8:
-	/* Limit % 8 == 0 =>. all data are equal.*/
-	ands	limit, limit, #7
-	b.eq	.Lret0
-
-.Ltiny8proc:
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-
-	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
-	b.eq	.Ltiny8proc
-	sub	result, data1, data2
-	ret
-.Lret0:
-	mov	result, #0
+	b.ne	.Lreturn
+	sub	limit, limit, 4
+.Lless4:
+	adds	limit, limit, 4
+	beq	.Lret_eq
+.Lbyte_loop:
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne  /* NZCV = 0b0000.  */
+	b.eq	.Lbyte_loop
+	sub	result, data1w, data2w
 	ret
 ENDPIPROC(memcmp)
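A note on the .Lreturn sequence, since it is the subtle part of the new
code: on little-endian, rev byte-swaps each word so that the byte at
the lowest address becomes the most significant, after which a single
unsigned compare ranks the words exactly as memcmp ranks the byte
strings; cset/cneg then materialize 0, 1 or -1.  A C rendering of what
that sequence computes (return_value is a hypothetical name,
little-endian assumed, __builtin_bswap64 is the GCC/Clang builtin):

#include <stdint.h>

/* What "rev; rev; cmp; cset result, ne; cneg result, result, lo"
   computes for the two 64-bit words being compared.  */
static int return_value(uint64_t data1, uint64_t data2)
{
	/* rev: make the first byte in memory the most significant.  */
	data1 = __builtin_bswap64(data1);
	data2 = __builtin_bswap64(data2);

	/* cset result, ne        -> result = (data1 != data2) ? 1 : 0
	   cneg result, result, lo -> negate when data1 < data2 (unsigned) */
	if (data1 == data2)
		return 0;
	return (data1 < data2) ? -1 : 1;
}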