arm64: lse: Prefetch operands to speed up atomic operations

On a Kryo 485 CPU (semi-custom Cortex-A76 derivative) in a Snapdragon 855 (SM8150) SoC, switching from traditional LL/SC atomics to LSE causes LKDTM's ATOMIC_TIMING test to regress by 2x: LL/SC ATOMIC_TIMING: 34.14s 34.08s LSE ATOMIC_TIMING: 70.84s 71.06s Prefetching the target operands fixes the regression and makes LSE perform better than LSE as expected: LSE+prfm ATOMIC_TIMING: 21.36s 21.21s "dd if=/dev/zero of=/dev/null count=10000000" also runs faster: LL/SC: 3.3 3.2 3.3 s LSE: 3.1 3.2 3.2 s LSE+p: 2.3 2.3 2.3 s Commit 0ea366f applied the same change to LL/SC atomics, but it was never ported to LSE. Signed-off-by: Danny Lin <danny@kdrag0n.dev> [Kazuki: Port to v5.4] Signed-off-by: Kazuki Hashimoto <kazukih@tuta.io>
Kaz205 · Jun 5, 2023 · a0309cd · a0309cd
1 parent 769dade
commit a0309cd
Showing 1 changed file with 7 additions and 0 deletions.
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
@@ -16,6 +16,7 @@ __lse_atomic_##op(int i, atomic_t *v)					\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
+	"	prfm	pstl1strm, %[v]\n"				\
 	"	" #asm_op "	%w[i], %[v]\n"				\
 	: [v] "+Q" (v->counter)						\
 	: [i] "r" (i));							\
@@ -41,6 +42,7 @@ __lse_atomic_fetch_##op##name(int i, atomic_t *v)			\
 									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
+	"	prfm	pstl1strm, %[v]\n"				\
 	"	" #asm_op #mb "	%w[i], %w[old], %[v]"			\
 	: [v] "+Q" (v->counter),					\
 	  [old] "=r" (old)						\
@@ -123,6 +125,7 @@ __lse_atomic64_##op(s64 i, atomic64_t *v)				\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
+	"	prfm	pstl1strm, %[v]\n"				\
 	"	" #asm_op "	%[i], %[v]\n"				\
 	: [v] "+Q" (v->counter)						\
 	: [i] "r" (i));							\
@@ -148,6 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v)			\
 									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
+	"	prfm	pstl1strm, %[v]\n"				\
 	"	" #asm_op #mb "	%[i], %[old], %[v]"			\
 	: [v] "+Q" (v->counter),					\
 	  [old] "=r" (old)						\
@@ -230,6 +234,7 @@ static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
 
 	asm volatile(
 	__LSE_PREAMBLE
+	"	prfm	pstl1strm, %[v]\n"
 	"1:	ldr	%x[tmp], %[v]\n"
 	"	subs	%[ret], %x[tmp], #1\n"
 	"	b.lt	2f\n"
@@ -258,6 +263,7 @@ __lse__cmpxchg_case_##name##sz(volatile void *ptr,			\
 									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
+	"	prfm	pstl1strm, %[v]\n"				\
 	"	mov	%" #w "[tmp], %" #w "[old]\n"			\
 	"	cas" #mb #sfx "\t%" #w "[tmp], %" #w "[new], %[v]\n"	\
 	"	mov	%" #w "[ret], %" #w "[tmp]"			\
@@ -306,6 +312,7 @@ __lse__cmpxchg_double##name(unsigned long old1,				\
 									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
+	"	prfm	pstl1strm, %[v]\n"				\
 	"	casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
 	"	eor	%[old1], %[old1], %[oldval1]\n"			\
 	"	eor	%[old2], %[old2], %[oldval2]\n"			\