arm64: lib: use optimized memcmp

Patch written by Wilco Dijkstra submitted for review to newlib:
https://sourceware.org/ml/newlib/2017/msg00524.html

This is an optimized memcmp for AArch64, a complete rewrite using a
different algorithm.  The previous version split into separate cases for
inputs that were both aligned, inputs that were mutually aligned, and
unaligned inputs handled with a byte loop.  The new version combines all
these cases, while small inputs of less than 8 bytes are handled separately.

This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared.  After the first 8 bytes,
align the first input.  This ensures that each iteration does at most one
unaligned access and that mutually aligned inputs behave as if aligned.
After the main loop, process the last 8 bytes using unaligned accesses.

This improves performance of (mutually) aligned cases by 25% and of
unaligned cases by more than 500% (over 6 times faster) on large inputs.
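
For readers who prefer C to AArch64 assembly, the following is a rough
little-endian C model of the approach described above.  It is an illustrative
sketch only, not the code in this patch; the helper names load64, cmp_words
and memcmp_model are invented for the sketch, the byte swaps use the GCC/Clang
__builtin_bswap64 builtin, and the small-input path is simplified to a plain
byte loop (a closer sketch of that path follows the diff below).

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Unaligned little-endian 64-bit load (memcpy keeps it well-defined C). */
static uint64_t load64(const unsigned char *p)
{
	uint64_t v;
	memcpy(&v, p, sizeof(v));
	return v;
}

/* Order two differing 8-byte words the way memcmp orders their bytes:
   byte-reverse so the first (lowest-addressed) byte becomes the most
   significant, then compare unsigned.  This mirrors the rev/cmp/cset/cneg
   sequence at .Lreturn in the assembly. */
static int cmp_words(uint64_t a, uint64_t b)
{
	a = __builtin_bswap64(a);
	b = __builtin_bswap64(b);
	return (a > b) - (a < b);
}

int memcmp_model(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p1 = s1, *p2 = s2;
	uint64_t a, b;
	size_t back;

	if (n < 8) {			/* small inputs handled separately */
		while (n--) {
			if (*p1 != *p2)
				return *p1 - *p2;
			p1++, p2++;
		}
		return 0;
	}

	/* First 8 bytes via unaligned loads. */
	a = load64(p1);
	b = load64(p2);
	if (a != b)
		return cmp_words(a, b);

	/* Step back so p1 becomes 8-byte aligned; the loop then performs at
	   most one unaligned load per iteration (from p2), and mutually
	   aligned inputs behave as if aligned. */
	back = (uintptr_t)(p1 + 8) & 7;
	p1 += 8 - back;
	p2 += 8 - back;
	n  -= 8 - back;

	while (n > 8) {			/* main loop: 8 bytes per iteration */
		a = load64(p1);
		b = load64(p2);
		if (a != b)
			return cmp_words(a, b);
		p1 += 8;
		p2 += 8;
		n  -= 8;
	}

	/* Last 1-8 bytes with a single, possibly overlapping, unaligned load
	   of the final 8 bytes of each buffer. */
	a = load64(p1 + n - 8);
	b = load64(p2 + n - 8);
	return a == b ? 0 : cmp_words(a, b);
}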

2017-06-28  Wilco Dijkstra  <wdijkstr@arm.com>

        * bionic/libc/arch-arm64/generic/bionic/memcmp.S (memcmp):
                Rewrite of optimized memcmp.

GLIBC benchtests/bench-memcmp.c performance comparison for Cortex-A53:

Length    1, alignment  1/ 1:        153%
Length    1, alignment  1/ 1:        119%
Length    1, alignment  1/ 1:        154%
Length    2, alignment  2/ 2:        121%
Length    2, alignment  2/ 2:        140%
Length    2, alignment  2/ 2:        121%
Length    3, alignment  3/ 3:        105%
Length    3, alignment  3/ 3:        105%
Length    3, alignment  3/ 3:        105%
Length    4, alignment  4/ 4:        155%
Length    4, alignment  4/ 4:        154%
Length    4, alignment  4/ 4:        161%
Length    5, alignment  5/ 5:        173%
Length    5, alignment  5/ 5:        173%
Length    5, alignment  5/ 5:        173%
Length    6, alignment  6/ 6:        145%
Length    6, alignment  6/ 6:        145%
Length    6, alignment  6/ 6:        145%
Length    7, alignment  7/ 7:        125%
Length    7, alignment  7/ 7:        125%
Length    7, alignment  7/ 7:        125%
Length    8, alignment  8/ 8:        111%
Length    8, alignment  8/ 8:        130%
Length    8, alignment  8/ 8:        124%
Length    9, alignment  9/ 9:        160%
Length    9, alignment  9/ 9:        160%
Length    9, alignment  9/ 9:        150%
Length   10, alignment 10/10:        170%
Length   10, alignment 10/10:        137%
Length   10, alignment 10/10:        150%
Length   11, alignment 11/11:        160%
Length   11, alignment 11/11:        160%
Length   11, alignment 11/11:        160%
Length   12, alignment 12/12:        146%
Length   12, alignment 12/12:        168%
Length   12, alignment 12/12:        156%
Length   13, alignment 13/13:        167%
Length   13, alignment 13/13:        167%
Length   13, alignment 13/13:        173%
Length   14, alignment 14/14:        167%
Length   14, alignment 14/14:        168%
Length   14, alignment 14/14:        168%
Length   15, alignment 15/15:        168%
Length   15, alignment 15/15:        173%
Length   15, alignment 15/15:        173%
Length    1, alignment  0/ 0:        134%
Length    1, alignment  0/ 0:        127%
Length    1, alignment  0/ 0:        119%
Length    2, alignment  0/ 0:        94%
Length    2, alignment  0/ 0:        94%
Length    2, alignment  0/ 0:        106%
Length    3, alignment  0/ 0:        82%
Length    3, alignment  0/ 0:        87%
Length    3, alignment  0/ 0:        82%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        122%
Length    5, alignment  0/ 0:        127%
Length    5, alignment  0/ 0:        119%
Length    5, alignment  0/ 0:        127%
Length    6, alignment  0/ 0:        103%
Length    6, alignment  0/ 0:        100%
Length    6, alignment  0/ 0:        100%
Length    7, alignment  0/ 0:        82%
Length    7, alignment  0/ 0:        91%
Length    7, alignment  0/ 0:        87%
Length    8, alignment  0/ 0:        111%
Length    8, alignment  0/ 0:        124%
Length    8, alignment  0/ 0:        124%
Length    9, alignment  0/ 0:        136%
Length    9, alignment  0/ 0:        136%
Length    9, alignment  0/ 0:        136%
Length   10, alignment  0/ 0:        136%
Length   10, alignment  0/ 0:        135%
Length   10, alignment  0/ 0:        136%
Length   11, alignment  0/ 0:        136%
Length   11, alignment  0/ 0:        136%
Length   11, alignment  0/ 0:        135%
Length   12, alignment  0/ 0:        136%
Length   12, alignment  0/ 0:        136%
Length   12, alignment  0/ 0:        136%
Length   13, alignment  0/ 0:        135%
Length   13, alignment  0/ 0:        136%
Length   13, alignment  0/ 0:        136%
Length   14, alignment  0/ 0:        136%
Length   14, alignment  0/ 0:        136%
Length   14, alignment  0/ 0:        136%
Length   15, alignment  0/ 0:        136%
Length   15, alignment  0/ 0:        136%
Length   15, alignment  0/ 0:        136%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        115%
Length    4, alignment  0/ 0:        115%
Length   32, alignment  0/ 0:        127%
Length   32, alignment  7/ 2:        395%
Length   32, alignment  0/ 0:        127%
Length   32, alignment  0/ 0:        127%
Length    8, alignment  0/ 0:        111%
Length    8, alignment  0/ 0:        124%
Length    8, alignment  0/ 0:        124%
Length   64, alignment  0/ 0:        128%
Length   64, alignment  6/ 4:        475%
Length   64, alignment  0/ 0:        131%
Length   64, alignment  0/ 0:        134%
Length   16, alignment  0/ 0:        128%
Length   16, alignment  0/ 0:        119%
Length   16, alignment  0/ 0:        128%
Length  128, alignment  0/ 0:        129%
Length  128, alignment  5/ 6:        475%
Length  128, alignment  0/ 0:        130%
Length  128, alignment  0/ 0:        129%
Length   32, alignment  0/ 0:        126%
Length   32, alignment  0/ 0:        126%
Length   32, alignment  0/ 0:        126%
Length  256, alignment  0/ 0:        127%
Length  256, alignment  4/ 8:        545%
Length  256, alignment  0/ 0:        126%
Length  256, alignment  0/ 0:        128%
Length   64, alignment  0/ 0:        171%
Length   64, alignment  0/ 0:        171%
Length   64, alignment  0/ 0:        174%
Length  512, alignment  0/ 0:        126%
Length  512, alignment  3/10:        585%
Length  512, alignment  0/ 0:        126%
Length  512, alignment  0/ 0:        127%
Length  128, alignment  0/ 0:        129%
Length  128, alignment  0/ 0:        128%
Length  128, alignment  0/ 0:        129%
Length 1024, alignment  0/ 0:        125%
Length 1024, alignment  2/12:        611%
Length 1024, alignment  0/ 0:        126%
Length 1024, alignment  0/ 0:        126%
Length  256, alignment  0/ 0:        128%
Length  256, alignment  0/ 0:        127%
Length  256, alignment  0/ 0:        128%
Length 2048, alignment  0/ 0:        125%
Length 2048, alignment  1/14:        625%
Length 2048, alignment  0/ 0:        125%
Length 2048, alignment  0/ 0:        125%
Length  512, alignment  0/ 0:        126%
Length  512, alignment  0/ 0:        127%
Length  512, alignment  0/ 0:        127%
Length 4096, alignment  0/ 0:        125%
Length 4096, alignment  0/16:        125%
Length 4096, alignment  0/ 0:        125%
Length 4096, alignment  0/ 0:        125%
Length 1024, alignment  0/ 0:        126%
Length 1024, alignment  0/ 0:        126%
Length 1024, alignment  0/ 0:        126%
Length 8192, alignment  0/ 0:        125%
Length 8192, alignment 63/18:        636%
Length 8192, alignment  0/ 0:        125%
Length 8192, alignment  0/ 0:        125%
Length   16, alignment  1/ 2:        317%
Length   16, alignment  1/ 2:        317%
Length   16, alignment  1/ 2:        317%
Length   32, alignment  2/ 4:        395%
Length   32, alignment  2/ 4:        395%
Length   32, alignment  2/ 4:        398%
Length   64, alignment  3/ 6:        475%
Length   64, alignment  3/ 6:        475%
Length   64, alignment  3/ 6:        477%
Length  128, alignment  4/ 8:        479%
Length  128, alignment  4/ 8:        479%
Length  128, alignment  4/ 8:        479%
Length  256, alignment  5/10:        543%
Length  256, alignment  5/10:        539%
Length  256, alignment  5/10:        543%
Length  512, alignment  6/12:        585%
Length  512, alignment  6/12:        585%
Length  512, alignment  6/12:        585%
Length 1024, alignment  7/14:        611%
Length 1024, alignment  7/14:        611%
Length 1024, alignment  7/14:        611%

Signed-off-by: Francisco Franco <franciscofranco.1990@gmail.com>
Signed-off-by: kdrag0n <dragon@khronodragon.com>
Signed-off-by: Raphiel Rollerscaperers <raphielscape@outlook.com>
Signed-off-by: laststandrighthere <laststandrighthere@gmail.com>
Signed-off-by: Lau <laststandrighthere@gmail.com>
Signed-off-by: Azuki <id_azuki@protonmail.com>
Wilco Dijkstra authored and Hadenix committed Feb 2, 2024
1 parent be6f802 commit b5e34b7
357 changes: 115 additions & 242 deletions arch/arm64/lib/memcmp.S
@@ -1,258 +1,131 @@
/*
* Copyright (C) 2013 ARM Ltd.
* Copyright (C) 2013 Linaro.
* Copyright (c) 2017 ARM Ltd
* All rights reserved.
*
* This code is based on glibc cortex strings work originally authored by Linaro
* and re-licensed under GPLv2 for the Linux kernel. The original code can
* be found @
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
* files/head:/src/aarch64/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* ARMv8-a, AArch64, unaligned accesses.
*/

/* includes here */
#include <linux/linkage.h>
#include <asm/assembler.h>

/*
* compare memory areas(when two memory areas' offset are different,
* alignment handled by the hardware)
*
* Parameters:
* x0 - const memory area 1 pointer
* x1 - const memory area 2 pointer
* x2 - the maximal compare byte length
* Returns:
* x0 - a compare result, maybe less than, equal to, or greater than ZERO
*/

/* Parameters and result. */
src1 .req x0
src2 .req x1
limit .req x2
result .req x0
#define src1 x0
#define src2 x1
#define limit x2
#define result w0

/* Internal variables. */
data1 .req x3
data1w .req w3
data2 .req x4
data2w .req w4
has_nul .req x5
diff .req x6
endloop .req x7
tmp1 .req x8
tmp2 .req x9
tmp3 .req x10
pos .req x11
limit_wd .req x12
mask .req x13

#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define tmp1 x5

/* Small inputs of less than 8 bytes are handled separately. This allows the
main code to be sped up using unaligned loads since there are now at least
8 bytes to be compared. If the first 8 bytes are equal, align src1.
This ensures each iteration does at most one unaligned access even if both
src1 and src2 are unaligned, and mutually aligned inputs behave as if
aligned. After the main loop, process the last 8 bytes using unaligned
accesses. */

.p2align 6
ENTRY(memcmp)
cbz limit, .Lret0
eor tmp1, src1, src2
tst tmp1, #7
b.ne .Lmisaligned8
ands tmp1, src1, #7
b.ne .Lmutual_align
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/*
* The input source addresses are at alignment boundary.
* Directly compare eight bytes each time.
*/
.Lloop_aligned:
ldr data1, [src1], #8
ldr data2, [src2], #8
.Lstart_realigned:
subs limit_wd, limit_wd, #1
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, cs /* Last Dword or differences. */
cbz endloop, .Lloop_aligned

/* Not reached the limit, must have found a diff. */
tbz limit_wd, #63, .Lnot_limit

/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
ands limit, limit, #7
b.eq .Lnot_limit
/*
* The remained bytes less than 8. It is needed to extract valid data
* from last eight bytes of the intended memory range.
*/
lsl limit, limit, #3 /* bytes-> bits. */
mov mask, #~0
CPU_BE( lsr mask, mask, limit )
CPU_LE( lsl mask, mask, limit )
bic data1, data1, mask
bic data2, data2, mask

orr diff, diff, mask
b .Lnot_limit

.Lmutual_align:
/*
* Sources are mutually aligned, but are not currently at an
* alignment boundary. Round down the addresses and then mask off
* the bytes that precede the start point.
*/
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
ldr data2, [src2], #8
/*
* We can not add limit with alignment offset(tmp1) here. Since the
* addition probably make the limit overflown.
*/
sub limit_wd, limit, #1/*limit != 0, so no underflow.*/
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
add tmp3, tmp3, tmp1
add limit_wd, limit_wd, tmp3, lsr #3
add limit, limit, tmp1/* Adjust the limit for the extra. */

lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
neg tmp1, tmp1/* Bits to alignment -64. */
mov tmp2, #~0
/*mask off the non-intended bytes before the start address.*/
CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
/* Little-endian. Early bytes are at LSB. */
CPU_LE( lsr tmp2, tmp2, tmp1 )

orr data1, data1, tmp2
orr data2, data2, tmp2
b .Lstart_realigned

/*src1 and src2 have different alignment offset.*/
.Lmisaligned8:
cmp limit, #8
b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/

and tmp1, src1, #7
neg tmp1, tmp1
add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
and tmp2, src2, #7
neg tmp2, tmp2
add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
subs tmp3, tmp1, tmp2
csel pos, tmp1, tmp2, hi /*Choose the maximum.*/

sub limit, limit, pos
/*compare the proceeding bytes in the first 8 byte segment.*/
.Ltinycmp:
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs pos, pos, #1
ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
b.eq .Ltinycmp
cbnz pos, 1f /*diff occurred before the last byte.*/
subs limit, limit, 8
b.lo .Lless8

/* Limit >= 8, so check first 8 bytes using unaligned loads. */
ldr data1, [src1], 8
ldr data2, [src2], 8
and tmp1, src1, 7
add limit, limit, tmp1
cmp data1, data2
bne .Lreturn

/* Align src1 and adjust src2 with bytes not yet done. */
sub src1, src1, tmp1
sub src2, src2, tmp1

subs limit, limit, 8
b.ls .Llast_bytes

/* Loop performing 8 bytes per iteration using aligned src1.
Limit is pre-decremented by 8 and must be larger than zero.
Exit if <= 8 bytes left to do or if the data is not equal. */
.p2align 4
.Lloop8:
ldr data1, [src1], 8
ldr data2, [src2], 8
subs limit, limit, 8
ccmp data1, data2, 0, hi /* NZCV = 0b0000. */
b.eq .Lloop8

cmp data1, data2
bne .Lreturn

/* Compare last 1-8 bytes using unaligned access. */
.Llast_bytes:
ldr data1, [src1, limit]
ldr data2, [src2, limit]

/* Compare data bytes and set return value to 0, -1 or 1. */
.Lreturn:
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
.Lret_eq:
cset result, ne
cneg result, result, lo
ret

.p2align 4
/* Compare up to 8 bytes. Limit is [-8..-1]. */
.Lless8:
adds limit, limit, 4
b.lo .Lless4
ldr data1w, [src1], 4
ldr data2w, [src2], 4
cmp data1w, data2w
b.eq .Lstart_align
1:
sub result, data1, data2
ret

.Lstart_align:
lsr limit_wd, limit, #3
cbz limit_wd, .Lremain8

ands xzr, src1, #7
b.eq .Lrecal_offset
/*process more leading bytes to make src1 aligned...*/
add src1, src1, tmp3 /*backwards src1 to alignment boundary*/
add src2, src2, tmp3
sub limit, limit, tmp3
lsr limit_wd, limit, #3
cbz limit_wd, .Lremain8
/*load 8 bytes from aligned SRC1..*/
ldr data1, [src1], #8
ldr data2, [src2], #8

subs limit_wd, limit_wd, #1
eor diff, data1, data2 /*Non-zero if differences found.*/
csinv endloop, diff, xzr, ne
cbnz endloop, .Lunequal_proc
/*How far is the current SRC2 from the alignment boundary...*/
and tmp3, tmp3, #7

.Lrecal_offset:/*src1 is aligned now..*/
neg pos, tmp3
.Lloopcmp_proc:
/*
* Divide the eight bytes into two parts. First,backwards the src2
* to an alignment boundary,load eight bytes and compare from
* the SRC2 alignment boundary. If all 8 bytes are equal,then start
* the second part's comparison. Otherwise finish the comparison.
* This special handle can garantee all the accesses are in the
* thread/task space in avoid to overrange access.
*/
ldr data1, [src1,pos]
ldr data2, [src2,pos]
eor diff, data1, data2 /* Non-zero if differences found. */
cbnz diff, .Lnot_limit

/*The second part process*/
ldr data1, [src1], #8
ldr data2, [src2], #8
eor diff, data1, data2 /* Non-zero if differences found. */
subs limit_wd, limit_wd, #1
csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
cbz endloop, .Lloopcmp_proc
.Lunequal_proc:
cbz diff, .Lremain8

/* There is difference occurred in the latest comparison. */
.Lnot_limit:
/*
* For little endian,reverse the low significant equal bits into MSB,then
* following CLZ can find how many equal bits exist.
*/
CPU_LE( rev diff, diff )
CPU_LE( rev data1, data1 )
CPU_LE( rev data2, data2 )

/*
* The MS-non-zero bit of DIFF marks either the first bit
* that is different, or the end of the significant data.
* Shifting left now will bring the critical information into the
* top bits.
*/
clz pos, diff
lsl data1, data1, pos
lsl data2, data2, pos
/*
* We need to zero-extend (char is unsigned) the value and then
* perform a signed subtraction.
*/
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret

.Lremain8:
/* Limit % 8 == 0 =>. all data are equal.*/
ands limit, limit, #7
b.eq .Lret0

.Ltiny8proc:
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1

ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
b.eq .Ltiny8proc
sub result, data1, data2
ret
.Lret0:
mov result, #0
b.ne .Lreturn
sub limit, limit, 4
.Lless4:
adds limit, limit, 4
beq .Lret_eq
.Lbyte_loop:
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
subs limit, limit, 1
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.eq .Lbyte_loop
sub result, data1w, data2w
ret
ENDPIPROC(memcmp)
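
As an aside on the small-input path in the new code (.Lless8, .Lless4 and
.Lbyte_loop): lengths of 4-7 bytes get one unaligned 32-bit comparison before
falling back to a byte loop.  A hedged little-endian C sketch of just that
path, with load32 and small_memcmp invented for illustration:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Unaligned little-endian 32-bit load. */
static uint32_t load32(const unsigned char *p)
{
	uint32_t v;
	memcpy(&v, p, sizeof(v));
	return v;
}

/* Compare n < 8 bytes: one 32-bit word compare when n >= 4, then bytes. */
static int small_memcmp(const unsigned char *p1, const unsigned char *p2,
			size_t n)
{
	if (n >= 4) {
		uint32_t a = load32(p1), b = load32(p2);

		if (a != b) {
			/* Earlier bytes become more significant after the
			   byte swap, as in the 64-bit .Lreturn path. */
			a = __builtin_bswap32(a);
			b = __builtin_bswap32(b);
			return a < b ? -1 : 1;
		}
		p1 += 4;
		p2 += 4;
		n  -= 4;
	}
	while (n--) {			/* .Lbyte_loop */
		if (*p1 != *p2)
			return *p1 - *p2;
		p1++, p2++;
	}
	return 0;
}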
