Skip to content

Commit

Permalink
Support for SME1 based sgemm_direct kernel for cblas_sgemm level 3 API
Browse files Browse the repository at this point in the history
* Added ARMV9SME target
* Added SGEMM_DIRECT kernel based on SME1
  • Loading branch information
vaiskv committed Jan 27, 2025
1 parent 18014b0 commit da50c80
Show file tree
Hide file tree
Showing 22 changed files with 650 additions and 33 deletions.
5 changes: 5 additions & 0 deletions Makefile.arm64
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve
endif
endif

ifeq ($(CORE), ARMV9SME)
CCOMMON_OPT += -march=armv9-a+sve2+sme
FCOMMON_OPT += -march=armv9-a+sve2
endif

ifeq ($(CORE), CORTEXA53)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
ifneq ($(F_COMPILER), NAG)
Expand Down
8 changes: 8 additions & 0 deletions Makefile.system
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64)
export MACOSX_DEPLOYMENT_TARGET=11.0
ifeq ($(C_COMPILER), GCC)
export NO_SVE = 1
export NO_SME = 1
endif
else
export MACOSX_DEPLOYMENT_TARGET=10.8
Expand Down Expand Up @@ -709,6 +710,9 @@ DYNAMIC_CORE += NEOVERSEN2
DYNAMIC_CORE += ARMV8SVE
DYNAMIC_CORE += A64FX
endif
ifneq ($(NO_SME), 1)
DYNAMIC_CORE += ARMV9SME
endif
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
DYNAMIC_CORE += TSV110
Expand Down Expand Up @@ -1474,6 +1478,10 @@ ifeq ($(NO_SVE), 1)
CCOMMON_OPT += -DNO_SVE
endif

ifeq ($(NO_SME), 1)
CCOMMON_OPT += -DNO_SME
endif

ifdef SMP
CCOMMON_OPT += -DSMP_SERVER

Expand Down
1 change: 1 addition & 0 deletions TargetList.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ THUNDERX3T110
VORTEX
A64FX
ARMV8SVE
ARMV9SME
FT2000

9.System Z:
Expand Down
18 changes: 15 additions & 3 deletions cmake/arch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,21 @@ endif ()

if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
endif ()
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
endif()
elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
endif ()
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
endif()
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
Expand Down
6 changes: 6 additions & 0 deletions cmake/cc.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE)
endif ()
endif ()

if (${CORE} STREQUAL ARMV9SME)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme")
endif ()
endif ()

if (${CORE} STREQUAL CORTEXA510)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
Expand Down
2 changes: 1 addition & 1 deletion cmake/prebuild.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1014,7 +1014,7 @@ endif ()
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "NEOVERSEN2")
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
Expand Down
38 changes: 27 additions & 11 deletions cmake/system.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,15 @@ endif()
# Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet?
# It seems we are meant to use TARGET as input and CORE internally as kernel.
if(NOT DEFINED CORE AND DEFINED TARGET)
set(CORE ${TARGET})
if (${TARGET} STREQUAL "LOONGSON3R5")
set(CORE "LA464")
elseif (${TARGET} STREQUAL "LOONGSON2K1000")
set(CORE "LA264")
elseif (${TARGET} STREQUAL "LOONGSONGENERIC")
set(CORE "LA64_GENERIC)")
else ()
set(CORE ${TARGET})
endif()
endif()

# TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
Expand Down Expand Up @@ -310,6 +318,9 @@ if (${TARGET} STREQUAL NEOVERSEV1)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
endif()
endif()
if (${TARGET} STREQUAL ARMV9SME)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3")
endif()
if (${TARGET} STREQUAL A64FX)
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")
Expand Down Expand Up @@ -382,6 +393,8 @@ if (NEED_PIC)
if (NOT NOFORTRAN)
if (${F_COMPILER} STREQUAL "SUN")
set(FCOMMON_OPT "${FCOMMON_OPT} -pic")
elseif (${F_COMPILER} STREQUAL "NAGFOR")
set(FCOMMON_OPT "${FCOMMON_OPT} -PIC")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC")
endif ()
Expand Down Expand Up @@ -640,17 +653,17 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
endif ()

if (CMAKE_Fortran_COMPILER)
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
message(STATUS "removing fortran flags")
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
message(STATUS "removing fortran flags")
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
endif ()
foreach (FILTER_FLAG ${FILTER_FLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
endforeach ()
endif ()
foreach (FILTER_FLAG ${FILTER_FLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
endforeach ()
endif ()
endif ()

if ("${F_COMPILER}" STREQUAL "GFORTRAN")
Expand All @@ -670,6 +683,9 @@ endif ()
if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE")
endif ()
if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE")
endif ()

if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
if ("${F_COMPILER}" STREQUAL "FLANG")
Expand Down
1 change: 1 addition & 0 deletions common.h
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,7 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void);

int support_avx512(void);
int support_sme1(void);

#ifdef USE_OPENMP

Expand Down
2 changes: 1 addition & 1 deletion common_arm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HUGE_PAGESIZE ( 4 << 20)

#ifndef BUFFERSIZE
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME)
#define BUFFER_SIZE (32 << 22)
#else
#define BUFFER_SIZE (32 << 20)
Expand Down
6 changes: 6 additions & 0 deletions common_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
#endif
#ifdef ARCH_ARM64
#ifdef HAVE_SME
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
#endif
#endif


int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
Expand Down
4 changes: 2 additions & 2 deletions common_s.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,9 +213,9 @@
#ifdef ARCH_X86_64
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
#define SGEMM_DIRECT gotoblas -> sgemm_direct
#else
#elif ARCH_ARM64
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant
#define SGEMM_DIRECT sgemm_direct
#define SGEMM_DIRECT gotoblas -> sgemm_direct
#endif

#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy
Expand Down
34 changes: 34 additions & 0 deletions driver/others/dynamic_arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE;
#else
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif
#ifdef DYN_ARMV9SME
extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif
#ifdef DYN_CORTEX_A55
extern gotoblas_t gotoblas_CORTEXA55;
#else
Expand Down Expand Up @@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX;
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#define gotoblas_A64FX gotoblas_ARMV8
#endif

#ifndef NO_SME
extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
#endif

extern gotoblas_t gotoblas_THUNDERX3T110;
#endif
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
Expand All @@ -168,6 +180,9 @@ extern void openblas_warning(int verbose, const char * msg);
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif
#ifndef HWCAP2_SME
#define HWCAP2_SME 1<<23
#endif

#define get_cpu_ftr(id, var) ({ \
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
Expand Down Expand Up @@ -393,6 +408,13 @@ static gotoblas_t *get_coretype(void) {
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
openblas_warning(1, coremsg);
}

#if !defined(NO_SME) && defined(HWCAP2_SME)
if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) {
return &gotoblas_ARMV9SME;
}
#endif

#ifndef NO_SVE
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
return &gotoblas_ARMV8SVE;
Expand Down Expand Up @@ -443,3 +465,15 @@ void gotoblas_dynamic_init(void) {
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

int support_sme1(void) {
int ret = 0;

#if (defined OS_LINUX || defined OS_ANDROID)
ret = getauxval(AT_HWCAP2) & HWCAP2_SME;
if(getauxval(AT_HWCAP2) & HWCAP2_SME){
ret = 1;
}
#endif
return ret;
}
13 changes: 13 additions & 0 deletions getarch.c
Original file line number Diff line number Diff line change
Expand Up @@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "ARMV8SVE"
#endif

#ifdef FORCE_ARMV9SME
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV9SME"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DARMV9SME " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9"
#define LIBNAME "armv9sme"
#define CORENAME "ARMV9SME"
#endif

#ifdef FORCE_ARMV8
#define FORCE
Expand Down
Loading

0 comments on commit da50c80

Please sign in to comment.