Skip to content

Commit

Permalink
Use proper unaligned access attributes
Browse files Browse the repository at this point in the history
Instead of using packed attribute hack, just use aligned attribute. It
improves code generation on armv6 and armv7, and slightly improves code
generation on aarch64. GCC generates identical code to regular aligned
access on ARMv6 for all versions between 4.5 and trunk, except GCC 5
which is buggy and generates the same (bad) code as packed access:
https://gcc.godbolt.org/z/hq37rz7sb
  • Loading branch information
Hello71 authored and terrelln committed Dec 15, 2022
1 parent fbff782 commit a78c91a
Show file tree
Hide file tree
Showing 10 changed files with 106 additions and 167 deletions.
1 change: 0 additions & 1 deletion contrib/linux-kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ libzstd:
-DFSE_STATIC_LINKING_ONLY \
-DHUF_STATIC_LINKING_ONLY \
-DXXH_STATIC_LINKING_ONLY \
-DMEM_FORCE_MEMORY_ACCESS=0 \
-D__GNUC__ \
-D__linux__=1 \
-DSTATIC_BMI2=0 \
Expand Down
49 changes: 16 additions & 33 deletions lib/common/mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,21 +133,15 @@ MEM_STATIC size_t MEM_swapST(size_t in);
/*-**************************************************************
* Memory I/O Implementation
*****************************************************************/
/* MEM_FORCE_MEMORY_ACCESS :
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
* The below switch allow to select different access method for improved performance.
* Method 0 (default) : use `memcpy()`. Safe and portable.
* Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable).
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
* Method 0 : always use `memcpy()`. Safe and portable.
* Method 1 : Use compiler extension to set unaligned access.
* Method 2 : direct access. This method is portable but violate C standard.
* It can generate buggy code on targets depending on alignment.
* In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6)
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
* Prefer these methods in priority order (0 > 1 > 2)
* Default : method 1 if supported, else method 0
*/
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
# ifdef __GNUC__
# define MEM_FORCE_MEMORY_ACCESS 1
# endif
#endif
Expand Down Expand Up @@ -190,30 +184,19 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }

#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
__pragma( pack(push, 1) )
typedef struct { U16 v; } unalign16;
typedef struct { U32 v; } unalign32;
typedef struct { U64 v; } unalign64;
typedef struct { size_t v; } unalignArch;
__pragma( pack(pop) )
#else
typedef struct { U16 v; } __attribute__((packed)) unalign16;
typedef struct { U32 v; } __attribute__((packed)) unalign32;
typedef struct { U64 v; } __attribute__((packed)) unalign64;
typedef struct { size_t v; } __attribute__((packed)) unalignArch;
#endif
typedef __attribute__((aligned(1))) U16 unalign16;
typedef __attribute__((aligned(1))) U32 unalign32;
typedef __attribute__((aligned(1))) U64 unalign64;
typedef __attribute__((aligned(1))) size_t unalignArch;

MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; }
MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; }
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }
MEM_STATIC size_t MEM_readST(const void* ptr) { return *(const unalignArch*)ptr; }

MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; }
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; }
MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; }
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(unalign32*)memPtr = value; }
MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(unalign64*)memPtr = value; }

#else

Expand Down
30 changes: 12 additions & 18 deletions lib/legacy/zstd_v01.c
Original file line number Diff line number Diff line change
Expand Up @@ -190,21 +190,15 @@ typedef signed long long S64;
/****************************************************************
* Memory I/O
*****************************************************************/
/* FSE_FORCE_MEMORY_ACCESS
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
* The below switch allow to select different access method for improved performance.
* Method 0 (default) : use `memcpy()`. Safe and portable.
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
/* FSE_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
* Method 0 : always use `memcpy()`. Safe and portable.
* Method 1 : Use compiler extension to set unaligned access.
* Method 2 : direct access. This method is portable but violate C standard.
* It can generate buggy code on targets generating assembly depending on alignment.
* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
* Prefer these methods in priority order (0 > 1 > 2)
* It can generate buggy code on targets depending on alignment.
* Default : method 1 if supported, else method 0
*/
#ifndef FSE_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
# ifdef __GNUC__
# define FSE_FORCE_MEMORY_ACCESS 1
# endif
#endif
Expand All @@ -229,13 +223,13 @@ static U64 FSE_read64(const void* memPtr) { return *(const U64*) memPtr; }

#elif defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==1)

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
typedef __attribute__((aligned(1))) U16 unalign16;
typedef __attribute__((aligned(1))) U32 unalign32;
typedef __attribute__((aligned(1))) U64 unalign64;

static U16 FSE_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
static U32 FSE_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
static U64 FSE_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
static U16 FSE_read16(const void* ptr) { return *(const unalign16*)ptr; }
static U32 FSE_read32(const void* ptr) { return *(const unalign32*)ptr; }
static U64 FSE_read64(const void* ptr) { return *(const unalign64*)ptr; }

#else

Expand Down
32 changes: 13 additions & 19 deletions lib/legacy/zstd_v02.c
Original file line number Diff line number Diff line change
Expand Up @@ -115,21 +115,15 @@ extern "C" {
/****************************************************************
* Memory I/O
*****************************************************************/
/* MEM_FORCE_MEMORY_ACCESS
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
* The below switch allow to select different access method for improved performance.
* Method 0 (default) : use `memcpy()`. Safe and portable.
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
* Method 0 : always use `memcpy()`. Safe and portable.
* Method 1 : Use compiler extension to set unaligned access.
* Method 2 : direct access. This method is portable but violate C standard.
* It can generate buggy code on targets generating assembly depending on alignment.
* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
* Prefer these methods in priority order (0 > 1 > 2)
* It can generate buggy code on targets depending on alignment.
* Default : method 1 if supported, else method 0
*/
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
# ifdef __GNUC__
# define MEM_FORCE_MEMORY_ACCESS 1
# endif
#endif
Expand All @@ -155,15 +149,15 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }

#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
typedef __attribute__((aligned(1))) U16 unalign16;
typedef __attribute__((aligned(1))) U32 unalign32;
typedef __attribute__((aligned(1))) U64 unalign64;

MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }

MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }

#else

Expand Down
32 changes: 13 additions & 19 deletions lib/legacy/zstd_v03.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,21 +116,15 @@ extern "C" {
/****************************************************************
* Memory I/O
*****************************************************************/
/* MEM_FORCE_MEMORY_ACCESS
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
* The below switch allow to select different access method for improved performance.
* Method 0 (default) : use `memcpy()`. Safe and portable.
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
* Method 0 : always use `memcpy()`. Safe and portable.
* Method 1 : Use compiler extension to set unaligned access.
* Method 2 : direct access. This method is portable but violate C standard.
* It can generate buggy code on targets generating assembly depending on alignment.
* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
* Prefer these methods in priority order (0 > 1 > 2)
* It can generate buggy code on targets depending on alignment.
* Default : method 1 if supported, else method 0
*/
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
# ifdef __GNUC__
# define MEM_FORCE_MEMORY_ACCESS 1
# endif
#endif
Expand All @@ -156,15 +150,15 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }

#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
typedef __attribute__((aligned(1))) U16 unalign16;
typedef __attribute__((aligned(1))) U32 unalign32;
typedef __attribute__((aligned(1))) U64 unalign64;

MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }

MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }

#else

Expand Down
32 changes: 13 additions & 19 deletions lib/legacy/zstd_v04.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,21 +87,15 @@ extern "C" {
/****************************************************************
* Memory I/O
*****************************************************************/
/* MEM_FORCE_MEMORY_ACCESS
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
* The below switch allow to select different access method for improved performance.
* Method 0 (default) : use `memcpy()`. Safe and portable.
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
* Method 0 : always use `memcpy()`. Safe and portable.
* Method 1 : Use compiler extension to set unaligned access.
* Method 2 : direct access. This method is portable but violate C standard.
* It can generate buggy code on targets generating assembly depending on alignment.
* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
* Prefer these methods in priority order (0 > 1 > 2)
* It can generate buggy code on targets depending on alignment.
* Default : method 1 if supported, else method 0
*/
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
# ifdef __GNUC__
# define MEM_FORCE_MEMORY_ACCESS 1
# endif
#endif
Expand All @@ -127,15 +121,15 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }

#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
typedef __attribute__((aligned(1))) U16 unalign16;
typedef __attribute__((aligned(1))) U32 unalign32;
typedef __attribute__((aligned(1))) U64 unalign64;

MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }

MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }

#else

Expand Down
34 changes: 14 additions & 20 deletions lib/legacy/zstd_v05.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,15 @@ extern "C" {
/*-**************************************************************
* Memory I/O
*****************************************************************/
/* MEM_FORCE_MEMORY_ACCESS :
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
* The below switch allow to select different access method for improved performance.
* Method 0 (default) : use `memcpy()`. Safe and portable.
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
* Method 0 : always use `memcpy()`. Safe and portable.
* Method 1 : Use compiler extension to set unaligned access.
* Method 2 : direct access. This method is portable but violate C standard.
* It can generate buggy code on targets depending on alignment.
* In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
* Prefer these methods in priority order (0 > 1 > 2)
* Default : method 1 if supported, else method 0
*/
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
# ifdef __GNUC__
# define MEM_FORCE_MEMORY_ACCESS 1
# endif
#endif
Expand Down Expand Up @@ -148,17 +142,17 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }

#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
typedef __attribute__((aligned(1))) U16 unalign16;
typedef __attribute__((aligned(1))) U32 unalign32;
typedef __attribute__((aligned(1))) U64 unalign64;

MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }

MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign*)memPtr)->u64 = value; }
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(unalign32*)memPtr = value; }
MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(unalign64*)memPtr = value; }

#else

Expand Down
Loading

0 comments on commit a78c91a

Please sign in to comment.