Use AVX unaligned memcpy only if AVX2 is available
memcpy with unaligned 256-bit AVX register loads/stores is slow on older
processors like Sandy Bridge.  This patch adds bit_AVX_Fast_Unaligned_Load
and sets it only when AVX2 is available.

	[BZ #17801]
	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
	New.
	(index_AVX_Fast_Unaligned_Load): Likewise.
	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
hjl-tools committed Jan 30, 2015
1 parent b658fdd commit 5f3d0b7
Showing 10 changed files with 37 additions and 10 deletions.
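
Before the per-file diffs, a standalone C sketch of the policy this commit implements: the AVX "unaligned" memcpy family is now selected only when AVX2 is usable, not merely AVX. This is not glibc code; the variable names are made up and GCC's __builtin_cpu_supports is used only for brevity.

/* Standalone illustration (not glibc code) of the policy in this commit:
   treat unaligned 256-bit AVX loads/stores as fast only when AVX2 is
   also present, since AVX-only parts such as Sandy Bridge handle them
   slowly.  Compile with GCC; __builtin_cpu_supports is a GCC builtin.  */
#include <stdio.h>

int
main (void)
{
  __builtin_cpu_init ();
  int avx  = __builtin_cpu_supports ("avx");
  int avx2 = __builtin_cpu_supports ("avx2");

  /* Before this commit: the AVX memcpy was picked whenever AVX was usable.
     After this commit: it is picked only when AVX2 is usable as well.  */
  int avx_fast_unaligned_load = avx2;

  printf ("AVX: %d  AVX2: %d\n", avx, avx2);
  if (avx_fast_unaligned_load)
    puts ("would select __memcpy_avx_unaligned");
  else
    puts ("would fall back to the SSE2/SSSE3 variants");
  return 0;
}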
18 changes: 18 additions & 0 deletions ChangeLog
@@ -1,3 +1,21 @@
2015-01-30 H.J. Lu <hongjiu.lu@intel.com>

[BZ #17801]
* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
New.
(index_AVX_Fast_Unaligned_Load): Likewise.
(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.

2015-01-29 Andreas Schwab <schwab@suse.de>

* sysdeps/nptl/allocrtsig.c: Include <signal.h>.
4 changes: 2 additions & 2 deletions NEWS
@@ -17,8 +17,8 @@ Version 2.21
17601, 17608, 17616, 17625, 17630, 17633, 17634, 17635, 17647, 17653,
17657, 17658, 17664, 17665, 17668, 17682, 17702, 17717, 17719, 17722,
17723, 17724, 17725, 17732, 17733, 17744, 17745, 17746, 17747, 17748,
- 17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17803,
- 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
+ 17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17801,
+ 17803, 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.

* A new semaphore algorithm has been implemented in generic C code for all
machines. Previous custom assembly implementations of semaphore were
9 changes: 7 additions & 2 deletions sysdeps/x86_64/multiarch/init-arch.c
@@ -171,9 +171,14 @@ __init_cpu_features (void)
/* Determine if AVX is usable. */
if (CPUID_AVX)
__cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
- /* Determine if AVX2 is usable. */
+ #if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+ # error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+ #endif
+ /* Determine if AVX2 is usable. Unaligned load with 256-bit
+    AVX registers are faster on processors with AVX2. */
if (CPUID_AVX2)
- __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
+ __cpu_features.feature[index_AVX2_Usable]
+   |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
/* Determine if FMA is usable. */
if (CPUID_FMA)
__cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
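
A note on the #if/#error guard added above: the new OR stores both bits into a single element of the feature array, which only works if index_AVX2_Usable and index_AVX_Fast_Unaligned_Load name the same word, and the guard checks that at compile time. Below is a hedged, standalone restatement of the same invariant; the bit values are taken from the init-arch.h hunk that follows, the index value 0 is a stand-in, and the _Static_assert form is for illustration only, not what the commit uses.

/* Illustrative restatement of the invariant enforced by the #if/#error
   guard in init-arch.c: both bits must live in the same feature word so
   that one OR can set them together.  */
#define FEATURE_INDEX_1                0
#define index_AVX2_Usable              FEATURE_INDEX_1
#define index_AVX_Fast_Unaligned_Load  FEATURE_INDEX_1
#define bit_AVX2_Usable                (1 << 10)
#define bit_AVX_Fast_Unaligned_Load    (1 << 11)

_Static_assert (index_AVX2_Usable == index_AVX_Fast_Unaligned_Load,
                "AVX2_Usable and AVX_Fast_Unaligned_Load must share a word");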
4 changes: 4 additions & 0 deletions sysdeps/x86_64/multiarch/init-arch.h
@@ -25,6 +25,7 @@
#define bit_FMA4_Usable (1 << 8)
#define bit_Slow_SSE4_2 (1 << 9)
#define bit_AVX2_Usable (1 << 10)
#define bit_AVX_Fast_Unaligned_Load (1 << 11)

/* CPUID Feature flags. */

@@ -74,6 +75,7 @@
# define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE
# define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE
# define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE
# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE

#else /* __ASSEMBLER__ */

@@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_FMA4_Usable FEATURE_INDEX_1
# define index_Slow_SSE4_2 FEATURE_INDEX_1
# define index_AVX2_Usable FEATURE_INDEX_1
# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1

# define HAS_ARCH_FEATURE(name) \
((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
@@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
# define HAS_AVX2 HAS_ARCH_FEATURE (AVX2_Usable)
# define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable)
# define HAS_FMA4 HAS_ARCH_FEATURE (FMA4_Usable)
# define HAS_AVX_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)

#endif /* __ASSEMBLER__ */
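
The three new definitions follow the existing bit_*/index_*/HAS_* pattern, so the C-side users further down (memmove.c, memmove_chk.c) end up testing one bit in one feature word via HAS_ARCH_FEATURE. A sketch of the expanded check, assuming init-arch.h is included; the helper name is invented for illustration.

/* Roughly what HAS_AVX_FAST_UNALIGNED_LOAD expands to through the
   HAS_ARCH_FEATURE definition shown above; requires init-arch.h.  */
static inline int
avx_fast_unaligned_load_p (void)
{
  return (__get_cpu_features ()->feature[index_AVX_Fast_Unaligned_Load]
          & bit_AVX_Fast_Unaligned_Load) != 0;
}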
2 changes: 1 addition & 1 deletion sysdeps/x86_64/multiarch/memcpy.S
@@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
jne 1f
call __init_cpu_features
1: leaq __memcpy_avx_unaligned(%rip), %rax
- testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
jz 1f
ret
1: leaq __memcpy_sse2(%rip), %rax
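
For reference, the assembly resolver above reads, in rough C terms, as follows. This is a sketch only: the real resolver stays in assembly, it assumes init-arch.h for HAS_AVX_FAST_UNALIGNED_LOAD, the function name and the initialization guard variable are invented, and whatever selection the resolver does after the visible SSE2 fallback is omitted.

/* C sketch of the __new_memcpy resolver logic shown above; the real
   code is assembly and this only mirrors the visible control flow.  */
#include <string.h>

extern __typeof (memcpy) __memcpy_avx_unaligned, __memcpy_sse2;
extern void __init_cpu_features (void);
extern int __cpu_features_initialized;  /* invented stand-in for the real guard */

static __typeof (memcpy) *
resolve_memcpy (void)                   /* invented name for the resolver */
{
  if (!__cpu_features_initialized)      /* the asm tests a field of __cpu_features */
    __init_cpu_features ();
  if (HAS_AVX_FAST_UNALIGNED_LOAD)      /* this commit: previously the AVX_Usable bit */
    return __memcpy_avx_unaligned;
  return __memcpy_sse2;                 /* remaining checks sit below the truncated hunk */
}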
2 changes: 1 addition & 1 deletion sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __memcpy_chk_ssse3_back(%rip), %rax
- testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
jz 2f
leaq __memcpy_chk_avx_unaligned(%rip), %rax
2: ret
2 changes: 1 addition & 1 deletion sysdeps/x86_64/multiarch/memmove.c
@@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
ifunc symbol properly. */
extern __typeof (__redirect_memmove) __libc_memmove;
libc_ifunc (__libc_memmove,
- HAS_AVX
+ HAS_AVX_FAST_UNALIGNED_LOAD
? __memmove_avx_unaligned
: (HAS_SSSE3
? (HAS_FAST_COPY_BACKWARD
2 changes: 1 addition & 1 deletion sysdeps/x86_64/multiarch/memmove_chk.c
@@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
#include "debug/memmove_chk.c"

libc_ifunc (__memmove_chk,
- HAS_AVX ? __memmove_chk_avx_unaligned :
+ HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
(HAS_SSSE3
? (HAS_FAST_COPY_BACKWARD
? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
2 changes: 1 addition & 1 deletion sysdeps/x86_64/multiarch/mempcpy.S
@@ -37,7 +37,7 @@ ENTRY(__mempcpy)
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __mempcpy_ssse3_back(%rip), %rax
- testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
jz 2f
leaq __mempcpy_avx_unaligned(%rip), %rax
2: ret
2 changes: 1 addition & 1 deletion sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __mempcpy_chk_ssse3_back(%rip), %rax
- testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
jz 2f
leaq __mempcpy_chk_avx_unaligned(%rip), %rax
2: ret
