From 23229011db2ab03c7643f1e0a007efc8e0276201 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 6 Aug 2018 18:20:40 +0300 Subject: [PATCH 001/189] [ZARCH] Z14 support, BLAS 1/2 single precision implementations, Some missing double precision implementations, Gemv optimization --- Makefile.zarch | 4 + cpuid_zarch.c | 35 +- kernel/zarch/KERNEL.Z13 | 20 +- kernel/zarch/KERNEL.Z14 | 146 +++++++ kernel/zarch/camax.c | 269 +++++++++++++ kernel/zarch/camin.c | 269 +++++++++++++ kernel/zarch/casum.c | 167 ++++++++ kernel/zarch/caxpy.c | 174 +++++++++ kernel/zarch/ccopy.c | 99 +++++ kernel/zarch/cdot.c | 182 +++++++++ kernel/zarch/crot.c | 256 ++++++++++++ kernel/zarch/cscal.c | 456 +++++++++++++++++++++ kernel/zarch/cswap.c | 183 +++++++++ kernel/zarch/damax.c | 206 ++++++++++ kernel/zarch/damin.c | 206 ++++++++++ kernel/zarch/dasum.c | 158 ++++---- kernel/zarch/daxpy.c | 177 ++++----- kernel/zarch/dcopy.c | 122 +----- kernel/zarch/ddot.c | 155 +++----- kernel/zarch/dgemv_n_4.c | 516 ++++++++++++++++-------- kernel/zarch/dgemv_t_4.c | 578 ++++++++++++++++++++------- kernel/zarch/dmax.c | 182 +++++++++ kernel/zarch/dmin.c | 182 +++++++++ kernel/zarch/drot.c | 338 ++++++++-------- kernel/zarch/dscal.c | 200 ++++------ kernel/zarch/dsdot.c | 180 +++++++++ kernel/zarch/dswap.c | 292 ++++---------- kernel/zarch/icamax.c | 319 +++++++++++++++ kernel/zarch/icamin.c | 319 +++++++++++++++ kernel/zarch/idamax.c | 295 +++++++------- kernel/zarch/idamin.c | 325 ++++++++------- kernel/zarch/idmax.c | 232 +++++++++++ kernel/zarch/idmin.c | 232 +++++++++++ kernel/zarch/isamax.c | 299 ++++++++++++++ kernel/zarch/isamin.c | 299 ++++++++++++++ kernel/zarch/ismax.c | 275 +++++++++++++ kernel/zarch/ismin.c | 275 +++++++++++++ kernel/zarch/izamax.c | 334 ++++++++-------- kernel/zarch/izamin.c | 400 +++++++++---------- kernel/zarch/samax.c | 210 ++++++++++ kernel/zarch/samin.c | 210 ++++++++++ kernel/zarch/sasum.c | 174 +++++++++ kernel/zarch/saxpy.c | 184 +++++++++ kernel/zarch/scopy.c | 85 ++++ kernel/zarch/sdot.c | 140 +++++++ kernel/zarch/sgemv_n_4.c | 668 +++++++++++++++++++++++++++++++ kernel/zarch/sgemv_t_4.c | 826 +++++++++++++++++++++++++++++++++++++++ kernel/zarch/smax.c | 186 +++++++++ kernel/zarch/smin.c | 186 +++++++++ kernel/zarch/srot.c | 246 ++++++++++++ kernel/zarch/sscal.c | 201 ++++++++++ kernel/zarch/sswap.c | 164 ++++++++ kernel/zarch/zamax.c | 221 +++++++++++ kernel/zarch/zamin.c | 221 +++++++++++ kernel/zarch/zasum.c | 152 +++---- kernel/zarch/zaxpy.c | 216 +++++----- kernel/zarch/zcopy.c | 86 +--- kernel/zarch/zdot.c | 213 ++++------ kernel/zarch/zrot.c | 339 ++++++++-------- kernel/zarch/zscal.c | 460 ++++++++++------------ kernel/zarch/zswap.c | 291 ++++---------- ztest/Makefile | 437 +++++++++++++++++++++ ztest/amax.c | 235 +++++++++++ ztest/amin.c | 235 +++++++++++ ztest/asum.c | 263 +++++++++++++ ztest/axpy.c | 303 ++++++++++++++ ztest/copy.c | 291 ++++++++++++++ ztest/dot.c | 296 ++++++++++++++ ztest/dsdot.c | 229 +++++++++++ ztest/gemv.c | 618 +++++++++++++++++++++++++++++ ztest/iamax.c | 284 ++++++++++++++ ztest/iamin.c | 284 ++++++++++++++ ztest/imax.c | 231 +++++++++++ ztest/imin.c | 231 +++++++++++ ztest/max.c | 229 +++++++++++ ztest/min.c | 229 +++++++++++ ztest/rot.c | 303 ++++++++++++++ ztest/scal.c | 308 +++++++++++++++ ztest/swap.c | 306 +++++++++++++++ 79 files changed, 17382 insertions(+), 2965 deletions(-) create mode 100644 kernel/zarch/KERNEL.Z14 create mode 100644 kernel/zarch/camax.c create mode 100644 kernel/zarch/camin.c create mode 100644 kernel/zarch/casum.c create 
mode 100644 kernel/zarch/caxpy.c create mode 100644 kernel/zarch/ccopy.c create mode 100644 kernel/zarch/cdot.c create mode 100644 kernel/zarch/crot.c create mode 100644 kernel/zarch/cscal.c create mode 100644 kernel/zarch/cswap.c create mode 100644 kernel/zarch/damax.c create mode 100644 kernel/zarch/damin.c create mode 100644 kernel/zarch/dmax.c create mode 100644 kernel/zarch/dmin.c create mode 100644 kernel/zarch/dsdot.c create mode 100644 kernel/zarch/icamax.c create mode 100644 kernel/zarch/icamin.c create mode 100644 kernel/zarch/idmax.c create mode 100644 kernel/zarch/idmin.c create mode 100644 kernel/zarch/isamax.c create mode 100644 kernel/zarch/isamin.c create mode 100644 kernel/zarch/ismax.c create mode 100644 kernel/zarch/ismin.c create mode 100644 kernel/zarch/samax.c create mode 100644 kernel/zarch/samin.c create mode 100644 kernel/zarch/sasum.c create mode 100644 kernel/zarch/saxpy.c create mode 100644 kernel/zarch/scopy.c create mode 100644 kernel/zarch/sdot.c create mode 100644 kernel/zarch/sgemv_n_4.c create mode 100644 kernel/zarch/sgemv_t_4.c create mode 100644 kernel/zarch/smax.c create mode 100644 kernel/zarch/smin.c create mode 100644 kernel/zarch/srot.c create mode 100644 kernel/zarch/sscal.c create mode 100644 kernel/zarch/sswap.c create mode 100644 kernel/zarch/zamax.c create mode 100644 kernel/zarch/zamin.c create mode 100644 ztest/Makefile create mode 100644 ztest/amax.c create mode 100644 ztest/amin.c create mode 100644 ztest/asum.c create mode 100644 ztest/axpy.c create mode 100644 ztest/copy.c create mode 100644 ztest/dot.c create mode 100644 ztest/dsdot.c create mode 100644 ztest/gemv.c create mode 100644 ztest/iamax.c create mode 100644 ztest/iamin.c create mode 100644 ztest/imax.c create mode 100644 ztest/imin.c create mode 100644 ztest/max.c create mode 100644 ztest/min.c create mode 100644 ztest/rot.c create mode 100644 ztest/scal.c create mode 100644 ztest/swap.c diff --git a/Makefile.zarch b/Makefile.zarch index 9ec9dc79fc..47ea1eb717 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector endif +ifeq ($(CORE), Z14) +CCOMMON_OPT += -march=z14 -mzvector +FCOMMON_OPT += -march=z14 -mzvector +endif diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 4e19354297..0ae32f27d7 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -29,40 +29,25 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", - "Z13" + "Z13", + "Z14" }; static char *cpuname_lower[] = { "zarch_generic", - "z13" + "z13", + "z14" }; int detect(void) { - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Type", buffer, 4)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if (strstr(p, "2964")) return CPU_Z13; - if (strstr(p, "2965")) return CPU_Z13; - - return CPU_GENERIC; + // return CPU_GENERIC; + return CPU_Z14; + } void get_libname(void) @@ -107,5 +92,9 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; + case CPU_Z14: + printf("#define Z14\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + break; } } diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index add628bfe1..d39b9d904b 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = ../arm/amax.c 
+DAMAXKERNEL = damax.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = zamax.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c +DAMINKERNEL = damin.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = zamin.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c +DMAXKERNEL = dmax.c SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c +DMINKERNEL = dmin.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c @@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = izamin.c ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = idmax.c ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c +IDMINKERNEL = idmin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = dasum.c @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = zgemv_t_4.c +ZGEMVTKERNEL = ../arm/zgemv_t.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 new file mode 100644 index 0000000000..fa88b68810 --- /dev/null +++ b/kernel/zarch/KERNEL.Z14 @@ -0,0 +1,146 @@ +SAMAXKERNEL = samax.c +DAMAXKERNEL = damax.c +CAMAXKERNEL = camax.c +ZAMAXKERNEL = zamax.c + +SAMINKERNEL = samin.c +DAMINKERNEL = damin.c +CAMINKERNEL = camin.c +ZAMINKERNEL = zamin.c + +SMAXKERNEL = smax.c +DMAXKERNEL = dmax.c + +SMINKERNEL = smin.c +DMINKERNEL = dmin.c + +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c + +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +IZAMINKERNEL = izamin.c + +ISMAXKERNEL = ismax.c +IDMAXKERNEL = idmax.c + +ISMINKERNEL = ismin.c +IDMINKERNEL = idmin.c + +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c + +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c +DSDOTKERNEL = dsdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c + +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c + +SGEMVNKERNEL = sgemv_n_4.c +DGEMVNKERNEL = dgemv_n_4.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = sgemv_t_4.c +DGEMVTKERNEL = dgemv_t_4.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = strmm8x4V.S +DTRMMKERNEL = trmm8x4V.S +CTRMMKERNEL = ctrmm4x4V.S +ZTRMMKERNEL = ztrmm4x4V.S + +SGEMMKERNEL = strmm8x4V.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + + + +DGEMMKERNEL = gemm8x4V.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = 
../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ctrmm4x4V.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ztrmm4x4V.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + + diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c new file mode 100644 index 0000000000..6394be7694 --- /dev/null +++ b/kernel/zarch/camax.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
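The camax/camin kernels that follow reduce a unit-stride single-precision complex vector to the largest (smallest) CABS1 value |Re(x_i)| + |Im(x_i)|; the drivers run the 32-way unrolled vector kernel on the n & -32 prefix and handle the remainder, and all strided cases, in scalar code. As a reference point, a minimal scalar sketch of the camax computation, assuming interleaved (re, im) storage (camax_ref is an illustrative name, not part of the patch):

#include <math.h>

/* Scalar model of camax_kernel_32 plus its driver: returns the largest
   CABS1 value |Re(x_i)| + |Im(x_i)| over n complex single-precision
   elements stored as interleaved (re, im) pairs. */
static float camax_ref(long n, const float *x)
{
    float maxf = fabsf(x[0]) + fabsf(x[1]);   /* CABS1 of element 0 */
    for (long i = 1; i < n; i++) {
        float cabs1 = fabsf(x[2 * i]) + fabsf(x[2 * i + 1]);
        if (cabs1 > maxf)
            maxf = cabs1;
    }
    return maxf;
}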
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
+{
+ FLOAT amax;
+
+ __asm__ volatile (
+ "vlef %%v0,0(%2),0 \n\t"
+ "vlef %%v16,4(%2),0 \n\t"
+ "vlef %%v0,8(%2),1 \n\t"
+ "vlef %%v16,12(%2),1 \n\t"
+ "vlef %%v0,16(%2),2 \n\t"
+ "vlef %%v16,20(%2),2 \n\t"
+ "vlef %%v0,24(%2),3 \n\t"
+ "vlef %%v16,28(%2),3 \n\t"
+ "vflpsb %%v0,%%v0 \n\t"
+ "vflpsb %%v16,%%v16 \n\t"
+ "vfasb %%v0,%%v0,%%v16 \n\t"
+ "srlg %%r0,%1,5 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+
+ "vlef %%v16,0(%%r1,%2),0 \n\t"
+ "vlef %%v17,4(%%r1,%2),0 \n\t"
+ "vlef %%v16,8(%%r1,%2),1 \n\t"
+ "vlef %%v17,12(%%r1,%2),1 \n\t"
+ "vlef %%v16,16(%%r1,%2),2 \n\t"
+ "vlef %%v17,20(%%r1,%2),2 \n\t"
+ "vlef %%v16,24(%%r1,%2),3 \n\t"
+ "vlef %%v17,28(%%r1,%2),3 \n\t"
+
+ "vlef %%v18,32(%%r1,%2),0 \n\t"
+ "vlef %%v19,36(%%r1,%2),0 \n\t"
+ "vlef %%v18,40(%%r1,%2),1 \n\t"
+ "vlef %%v19,44(%%r1,%2),1 \n\t"
+ "vlef %%v18,48(%%r1,%2),2 \n\t"
+ "vlef %%v19,52(%%r1,%2),2 \n\t"
+ "vlef %%v18,56(%%r1,%2),3 \n\t"
+ "vlef %%v19,60(%%r1,%2),3 \n\t"
+
+ "vlef %%v20,64(%%r1,%2),0 \n\t"
+ "vlef %%v21,68(%%r1,%2),0 \n\t"
+ "vlef %%v20,72(%%r1,%2),1 \n\t"
+ "vlef %%v21,76(%%r1,%2),1 \n\t"
+ "vlef %%v20,80(%%r1,%2),2 \n\t"
+ "vlef %%v21,84(%%r1,%2),2 \n\t"
+ "vlef %%v20,88(%%r1,%2),3 \n\t"
+ "vlef %%v21,92(%%r1,%2),3 \n\t"
+
+ "vlef %%v22,96(%%r1,%2),0 \n\t"
+ "vlef %%v23,100(%%r1,%2),0 \n\t"
+ "vlef %%v22,104(%%r1,%2),1 \n\t"
+ "vlef %%v23,108(%%r1,%2),1 \n\t"
+ "vlef %%v22,112(%%r1,%2),2 \n\t"
+ "vlef %%v23,116(%%r1,%2),2 \n\t"
+ "vlef %%v22,120(%%r1,%2),3 \n\t"
+ "vlef %%v23,124(%%r1,%2),3 \n\t"
+
+ "vflpsb %%v16, %%v16 \n\t"
+ "vflpsb %%v17, %%v17 \n\t"
+ "vflpsb %%v18, %%v18 \n\t"
+ "vflpsb %%v19, %%v19 \n\t"
+ "vflpsb %%v20, %%v20 \n\t"
+ "vflpsb %%v21, %%v21 \n\t"
+ "vflpsb %%v22, %%v22 \n\t"
+ "vflpsb %%v23, %%v23 \n\t"
+ "vfasb %%v16,%%v16,%%v17 \n\t"
+ "vfasb %%v17,%%v18,%%v19 \n\t"
+ "vfasb %%v18,%%v20,%%v21 \n\t"
+ "vfasb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchsb %%v24,%%v16,%%v17 \n\t"
+ "vfchsb %%v25,%%v18,%%v19 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchsb %%v26,%%v24,%%v25 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchsb %%v27,%%v26,%%v0 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "vlef %%v16,128(%%r1,%2),0 \n\t"
+ "vlef %%v17,132(%%r1,%2),0 \n\t"
+ "vlef %%v16,136(%%r1,%2),1 \n\t"
+ "vlef %%v17,140(%%r1,%2),1 \n\t"
+ "vlef %%v16,144(%%r1,%2),2 \n\t"
+ "vlef %%v17,148(%%r1,%2),2 \n\t"
+ "vlef %%v16,152(%%r1,%2),3 \n\t"
+ "vlef %%v17,156(%%r1,%2),3 \n\t"
+
+ "vlef %%v18,160(%%r1,%2),0 \n\t"
+ "vlef %%v19,164(%%r1,%2),0 \n\t"
+ "vlef %%v18,168(%%r1,%2),1 \n\t"
+ "vlef %%v19,172(%%r1,%2),1 \n\t"
+ "vlef %%v18,176(%%r1,%2),2 \n\t"
+ "vlef %%v19,180(%%r1,%2),2 \n\t"
+ "vlef %%v18,184(%%r1,%2),3 \n\t"
+ "vlef %%v19,188(%%r1,%2),3 \n\t"
+
+ "vlef %%v20,192(%%r1,%2),0 \n\t"
+ "vlef %%v21,196(%%r1,%2),0 \n\t"
+ "vlef %%v20,200(%%r1,%2),1 \n\t"
+ "vlef %%v21,204(%%r1,%2),1 \n\t"
+ "vlef %%v20,208(%%r1,%2),2 \n\t"
+ "vlef %%v21,212(%%r1,%2),2 \n\t"
+ "vlef %%v20,216(%%r1,%2),3 \n\t"
+ "vlef %%v21,220(%%r1,%2),3 \n\t"
+
+ "vlef %%v22,224(%%r1,%2),0 \n\t"
+ "vlef %%v23,228(%%r1,%2),0 \n\t"
+ "vlef %%v22,232(%%r1,%2),1 \n\t"
+ "vlef %%v23,236(%%r1,%2),1 \n\t"
+ "vlef %%v22,240(%%r1,%2),2 \n\t"
+ "vlef %%v23,244(%%r1,%2),2 \n\t"
+ "vlef %%v22,248(%%r1,%2),3 \n\t"
+ "vlef %%v23,252(%%r1,%2),3 \n\t"
+
+ "vflpsb %%v16, %%v16 \n\t"
+ "vflpsb %%v17, %%v17 \n\t"
+ "vflpsb %%v18, %%v18 \n\t"
+ "vflpsb %%v19, %%v19 \n\t"
+ "vflpsb %%v20, %%v20 \n\t"
+ "vflpsb %%v21, %%v21 \n\t"
+ "vflpsb %%v22, %%v22 \n\t"
+ "vflpsb %%v23, %%v23 \n\t"
+ "vfasb %%v16,%%v16,%%v17 \n\t"
+ "vfasb %%v17,%%v18,%%v19 \n\t"
+ "vfasb %%v18,%%v20,%%v21 \n\t"
+ "vfasb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchsb %%v24,%%v16,%%v17 \n\t"
+ "vfchsb %%v25,%%v18,%%v19 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchsb %%v26,%%v24,%%v25 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchsb %%v27,%%v26,%%v0 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "agfi %%r1, 256 \n\t"
+ "brctg %%r0, 0b \n\t"
+
+ "veslg %%v16,%%v0,32 \n\t"
+ "vfchsb %%v17,%%v16,%%v0 \n\t"
+ "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+
+ "vrepf %%v16,%%v0,2 \n\t"
+ "wfchsb %%v17,%%v16,%%v0 \n\t"
+ "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+ "ler %0,%%f0 "
+ :"=f"(amax)
+ :"r"(n),"ZR"((const FLOAT (*)[n])x)
+ :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+ );
+
+ return amax;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+ BLASLONG i = 0;
+ BLASLONG j = 0;
+ FLOAT maxf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return (maxf);
+
+ if (inc_x == 1) {
+
+ BLASLONG n1 = n & -32;
+ if (n1 > 0) {
+
+ maxf = camax_kernel_32(n1, x);
+
+ i = n1;
+ }
+ else
+ {
+ maxf=CABS1(x,0);
+ i++;
+ }
+
+ while (i < n) {
+ if (CABS1(x,i*2) > maxf) {
+ maxf = CABS1(x,i*2);
+ }
+ i++;
+ }
+ return (maxf);
+
+ } else {
+
+ inc_x2 = 2 * inc_x;
+ maxf=CABS1(x,0);
+ i += inc_x2;
+ j++;
+
+ BLASLONG n1 = (n - 1) & -4;
+ while (j < n1) {
+
+ if (CABS1(x,i) > maxf) {
+ maxf = CABS1(x,i);
+ }
+ if (CABS1(x,i+inc_x2) > maxf) {
+ maxf = CABS1(x,i+inc_x2);
+ }
+ if (CABS1(x,i+inc_x2*2) > maxf) {
+ maxf = CABS1(x,i+inc_x2*2);
+ }
+ if (CABS1(x,i+inc_x2*3) > maxf) {
+ maxf = CABS1(x,i+inc_x2*3);
+ }
+
+ i += inc_x2 * 4;
+
+ j += 4;
+
+ }
+
+
+ while (j < n) {
+ if (CABS1(x,i) > maxf) {
+ maxf = CABS1(x,i);
+ }
+ i += inc_x2;
+ j++;
+ }
+ return (maxf);
+ }
+}
diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c
new file mode 100644
index 0000000000..936c300c88
--- /dev/null
+++ b/kernel/zarch/camin.c
@@ -0,0 +1,269 @@
+/***************************************************************************
+Copyright (c) 2013-2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
+{
+ FLOAT amin;
+
+ __asm__ volatile (
+ "vlef %%v0,0(%2),0 \n\t"
+ "vlef %%v16,4(%2),0 \n\t"
+ "vlef %%v0,8(%2),1 \n\t"
+ "vlef %%v16,12(%2),1 \n\t"
+ "vlef %%v0,16(%2),2 \n\t"
+ "vlef %%v16,20(%2),2 \n\t"
+ "vlef %%v0,24(%2),3 \n\t"
+ "vlef %%v16,28(%2),3 \n\t"
+ "vflpsb %%v0,%%v0 \n\t"
+ "vflpsb %%v16,%%v16 \n\t"
+ "vfasb %%v0,%%v0,%%v16 \n\t"
+ "srlg %%r0,%1,5 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+
+ "vlef %%v16,0(%%r1,%2),0 \n\t"
+ "vlef %%v17,4(%%r1,%2),0 \n\t"
+ "vlef %%v16,8(%%r1,%2),1 \n\t"
+ "vlef %%v17,12(%%r1,%2),1 \n\t"
+ "vlef %%v16,16(%%r1,%2),2 \n\t"
+ "vlef %%v17,20(%%r1,%2),2 \n\t"
+ "vlef %%v16,24(%%r1,%2),3 \n\t"
+ "vlef %%v17,28(%%r1,%2),3 \n\t"
+
+ "vlef %%v18,32(%%r1,%2),0 \n\t"
+ "vlef %%v19,36(%%r1,%2),0 \n\t"
+ "vlef %%v18,40(%%r1,%2),1 \n\t"
+ "vlef %%v19,44(%%r1,%2),1 \n\t"
+ "vlef %%v18,48(%%r1,%2),2 \n\t"
+ "vlef %%v19,52(%%r1,%2),2 \n\t"
+ "vlef %%v18,56(%%r1,%2),3 \n\t"
+ "vlef %%v19,60(%%r1,%2),3 \n\t"
+
+ "vlef %%v20,64(%%r1,%2),0 \n\t"
+ "vlef %%v21,68(%%r1,%2),0 \n\t"
+ "vlef %%v20,72(%%r1,%2),1 \n\t"
+ "vlef %%v21,76(%%r1,%2),1 \n\t"
+ "vlef %%v20,80(%%r1,%2),2 \n\t"
+ "vlef %%v21,84(%%r1,%2),2 \n\t"
+ "vlef %%v20,88(%%r1,%2),3 \n\t"
+ "vlef %%v21,92(%%r1,%2),3 \n\t"
+
+ "vlef %%v22,96(%%r1,%2),0 \n\t"
+ "vlef %%v23,100(%%r1,%2),0 \n\t"
+ "vlef %%v22,104(%%r1,%2),1 \n\t"
+ "vlef %%v23,108(%%r1,%2),1 \n\t"
+ "vlef %%v22,112(%%r1,%2),2 \n\t"
+ "vlef %%v23,116(%%r1,%2),2 \n\t"
+ "vlef %%v22,120(%%r1,%2),3 \n\t"
+ "vlef %%v23,124(%%r1,%2),3 \n\t"
+
+ "vflpsb %%v16, %%v16 \n\t"
+ "vflpsb %%v17, %%v17 \n\t"
+ "vflpsb %%v18, %%v18 \n\t"
+ "vflpsb %%v19, %%v19 \n\t"
+ "vflpsb %%v20, %%v20 \n\t"
+ "vflpsb %%v21, %%v21 \n\t"
+ "vflpsb %%v22, %%v22 \n\t"
+ "vflpsb %%v23, %%v23 \n\t"
+ "vfasb %%v16,%%v16,%%v17 \n\t"
+ "vfasb %%v17,%%v18,%%v19 \n\t"
+ "vfasb %%v18,%%v20,%%v21 \n\t"
+ "vfasb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchsb %%v24,%%v17,%%v16 \n\t"
+ "vfchsb %%v25,%%v19,%%v18 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchsb %%v26,%%v25,%%v24 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchsb %%v27,%%v0,%%v26 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "vlef %%v16,128(%%r1,%2),0 \n\t"
+ "vlef %%v17,132(%%r1,%2),0 \n\t"
+ "vlef %%v16,136(%%r1,%2),1 \n\t"
+ "vlef %%v17,140(%%r1,%2),1 \n\t"
+ "vlef %%v16,144(%%r1,%2),2 \n\t"
+ "vlef %%v17,148(%%r1,%2),2 \n\t"
+ "vlef %%v16,152(%%r1,%2),3 \n\t"
+ "vlef %%v17,156(%%r1,%2),3 \n\t"
+
+ "vlef %%v18,160(%%r1,%2),0 \n\t"
+ "vlef %%v19,164(%%r1,%2),0 \n\t"
+ "vlef %%v18,168(%%r1,%2),1 \n\t"
+ "vlef %%v19,172(%%r1,%2),1 \n\t"
+ "vlef %%v18,176(%%r1,%2),2 \n\t"
+ "vlef %%v19,180(%%r1,%2),2 \n\t"
+ "vlef %%v18,184(%%r1,%2),3 \n\t"
+ "vlef %%v19,188(%%r1,%2),3 \n\t"
+
+ "vlef %%v20,192(%%r1,%2),0 \n\t"
+ "vlef %%v21,196(%%r1,%2),0 \n\t"
+ "vlef %%v20,200(%%r1,%2),1 \n\t"
+ "vlef %%v21,204(%%r1,%2),1 \n\t"
+ "vlef %%v20,208(%%r1,%2),2 \n\t"
+ "vlef %%v21,212(%%r1,%2),2 \n\t"
+ "vlef %%v20,216(%%r1,%2),3 \n\t"
+ "vlef %%v21,220(%%r1,%2),3 \n\t"
+
+ "vlef %%v22,224(%%r1,%2),0 \n\t"
+ "vlef %%v23,228(%%r1,%2),0 \n\t"
+ "vlef %%v22,232(%%r1,%2),1 \n\t"
+ "vlef %%v23,236(%%r1,%2),1 \n\t"
+ "vlef %%v22,240(%%r1,%2),2 \n\t"
+ "vlef %%v23,244(%%r1,%2),2 \n\t"
+ "vlef %%v22,248(%%r1,%2),3 \n\t"
+ "vlef %%v23,252(%%r1,%2),3 \n\t"
+
+ "vflpsb %%v16, %%v16 \n\t"
+ "vflpsb %%v17, %%v17 \n\t"
+ "vflpsb %%v18, %%v18 \n\t"
+ "vflpsb %%v19, %%v19 \n\t"
+ "vflpsb %%v20, %%v20 \n\t"
+ "vflpsb %%v21, %%v21 \n\t"
+ "vflpsb %%v22, %%v22 \n\t"
+ "vflpsb %%v23, %%v23 \n\t"
+ "vfasb %%v16,%%v16,%%v17 \n\t"
+ "vfasb %%v17,%%v18,%%v19 \n\t"
+ "vfasb %%v18,%%v20,%%v21 \n\t"
+ "vfasb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchsb %%v24,%%v17,%%v16 \n\t"
+ "vfchsb %%v25,%%v19,%%v18 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchsb %%v26,%%v25,%%v24 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchsb %%v27,%%v0,%%v26 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "agfi %%r1, 256 \n\t"
+ "brctg %%r0, 0b \n\t"
+
+ "veslg %%v16,%%v0,32 \n\t"
+ "vfchsb %%v17,%%v0,%%v16 \n\t"
+ "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+
+ "vrepf %%v16,%%v0,2 \n\t"
+ "wfchsb %%v17,%%v0,%%v16 \n\t"
+ "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+ "ler %0,%%f0 "
+ :"=f"(amin)
+ :"r"(n),"ZR"((const FLOAT (*)[n])x)
+ :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+ );
+
+ return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+ BLASLONG i = 0;
+ BLASLONG j = 0;
+ FLOAT minf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return (minf);
+
+ if (inc_x == 1) {
+
+ BLASLONG n1 = n & -32;
+ if (n1 > 0) {
+
+ minf = camin_kernel_32(n1, x);
+
+ i = n1;
+ }
+ else
+ {
+ minf=CABS1(x,0);
+ i++;
+ }
+
+ while (i < n) {
+ if (CABS1(x,i*2) < minf) {
+ minf = CABS1(x,i*2);
+ }
+ i++;
+ }
+ return (minf);
+
+ } else {
+
+ inc_x2 = 2 * inc_x;
+ minf=CABS1(x,0);
+ i += inc_x2;
+ j++;
+
+ BLASLONG n1 = (n - 1) & -4;
+ while (j < n1) {
+
+ if (CABS1(x,i) < minf) {
+ minf = CABS1(x,i);
+ }
+ if (CABS1(x,i+inc_x2) < minf) {
+ minf = CABS1(x,i+inc_x2);
+ }
+ if (CABS1(x,i+inc_x2*2) < minf) {
+ minf = CABS1(x,i+inc_x2*2);
+ }
+ if (CABS1(x,i+inc_x2*3) < minf) {
+ minf = CABS1(x,i+inc_x2*3);
+ }
+
+ i += inc_x2 * 4;
+
+ j += 4;
+
+ }
+
+
+ while (j < n) {
+ if (CABS1(x,i) < minf) {
+ minf = CABS1(x,i);
+ }
+ i += inc_x2;
+ j++;
+ }
+ return (minf);
+ }
+}
diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c
new file mode 100644
index 0000000000..f4ebc21bd8
--- /dev/null
+++ b/kernel/zarch/casum.c
@@ -0,0 +1,167 @@
+/***************************************************************************
+Copyright (c) 2013-2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3.
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
+{
+ FLOAT asum;
+
+ __asm__ (
+ "vzero %%v0 \n\t"
+ "vzero %%v1 \n\t"
+ "vzero %%v2 \n\t"
+ "vzero %%v3 \n\t"
+ "srlg %%r0,%1,5 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+ "vl %%v16, 0(%%r1,%2) \n\t"
+ "vl %%v17, 16(%%r1,%2) \n\t"
+ "vl %%v18, 32(%%r1,%2) \n\t"
+ "vl %%v19, 48(%%r1,%2) \n\t"
+ "vl %%v20, 64(%%r1,%2) \n\t"
+ "vl %%v21, 80(%%r1,%2) \n\t"
+ "vl %%v22, 96(%%r1,%2) \n\t"
+ "vl %%v23, 112(%%r1,%2) \n\t"
+
+ "vflpsb %%v16, %%v16 \n\t"
+ "vflpsb %%v17, %%v17 \n\t"
+ "vflpsb %%v18, %%v18 \n\t"
+ "vflpsb %%v19, %%v19 \n\t"
+ "vflpsb %%v20, %%v20 \n\t"
+ "vflpsb %%v21, %%v21 \n\t"
+ "vflpsb %%v22, %%v22 \n\t"
+ "vflpsb %%v23, %%v23 \n\t"
+
+ "vfasb %%v0,%%v0,%%v16 \n\t"
+ "vfasb %%v1,%%v1,%%v17 \n\t"
+ "vfasb %%v2,%%v2,%%v18 \n\t"
+ "vfasb %%v3,%%v3,%%v19 \n\t"
+ "vfasb %%v0,%%v0,%%v20 \n\t"
+ "vfasb %%v1,%%v1,%%v21 \n\t"
+ "vfasb %%v2,%%v2,%%v22 \n\t"
+ "vfasb %%v3,%%v3,%%v23 \n\t"
+
+ "vl %%v16, 128(%%r1,%2) \n\t"
+ "vl %%v17, 144(%%r1,%2) \n\t"
+ "vl %%v18, 160(%%r1,%2) \n\t"
+ "vl %%v19, 176(%%r1,%2) \n\t"
+ "vl %%v20, 192(%%r1,%2) \n\t"
+ "vl %%v21, 208(%%r1,%2) \n\t"
+ "vl %%v22, 224(%%r1,%2) \n\t"
+ "vl %%v23, 240(%%r1,%2) \n\t"
+
+ "vflpsb %%v16, %%v16 \n\t"
+ "vflpsb %%v17, %%v17 \n\t"
+ "vflpsb %%v18, %%v18 \n\t"
+ "vflpsb %%v19, %%v19 \n\t"
+ "vflpsb %%v20, %%v20 \n\t"
+ "vflpsb %%v21, %%v21 \n\t"
+ "vflpsb %%v22, %%v22 \n\t"
+ "vflpsb %%v23, %%v23 \n\t"
+
+ "vfasb %%v0,%%v0,%%v16 \n\t"
+ "vfasb %%v1,%%v1,%%v17 \n\t"
+ "vfasb %%v2,%%v2,%%v18 \n\t"
+ "vfasb %%v3,%%v3,%%v19 \n\t"
+ "vfasb %%v0,%%v0,%%v20 \n\t"
+ "vfasb %%v1,%%v1,%%v21 \n\t"
+ "vfasb %%v2,%%v2,%%v22 \n\t"
+ "vfasb %%v3,%%v3,%%v23 \n\t"
+
+ "agfi %%r1,256 \n\t"
+ "brctg %%r0,0b \n\t"
+ "vfasb %%v0,%%v0,%%v1 \n\t"
+ "vfasb %%v0,%%v0,%%v2 \n\t"
+ "vfasb %%v0,%%v0,%%v3 \n\t"
+ "veslg %%v1,%%v0,32 \n\t"
+ "vfasb %%v0,%%v0,%%v1 \n\t"
+ "vrepf %%v1,%%v0,2 \n\t"
+ "aebr %%f0,%%f1 \n\t"
+ "ler %0,%%f0 "
+ :"=f"(asum)
+ :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
+ :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
+ );
+
+ return asum;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ip=0;
+ FLOAT sumf = 0.0;
+ BLASLONG n1;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return(sumf);
+
+ if ( inc_x == 1 )
+ {
+
+ n1 = n & -32;
+ if ( n1 >
0 ) + { + + sumf = casum_kernel_32(n1, x); + i=n1; + ip=2*n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + i++; + ip+=2; + } + + } + else + { + inc_x2 = 2* inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip+=inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c new file mode 100644 index 0000000000..2176f3dcd9 --- /dev/null +++ b/kernel/zarch/caxpy.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
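caxpy.c below vectorizes the unit-stride update y += alpha * x over interleaved (re, im) pairs, with the CONJ build computing y += alpha * conj(x). A minimal scalar sketch of the same update, mirroring the patch's own scalar cleanup loop (caxpy_ref is an illustrative name, not part of the patch):

/* Scalar model of the unit-stride caxpy update on interleaved (re, im)
   pairs; the CONJ variant conjugates x in the product. */
static void caxpy_ref(long n, float da_r, float da_i,
                      const float *x, float *y)
{
    for (long i = 0; i < 2 * n; i += 2) {
#if !defined(CONJ)
        y[i]     += da_r * x[i]     - da_i * x[i + 1]; /* Re(y + a*x) */
        y[i + 1] += da_r * x[i + 1] + da_i * x[i];     /* Im(y + a*x) */
#else
        y[i]     += da_r * x[i]     + da_i * x[i + 1]; /* Re(y + a*conj(x)) */
        y[i + 1] -= da_r * x[i + 1] - da_i * x[i];     /* Im(y + a*conj(x)) */
#endif
    }
}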
+*****************************************************************************/ + +#include "common.h" + +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( +#if !defined(CONJ) + "vlrepf %%v0,0(%3) \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v1,4(%3),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%3),1 \n\t" + "vlef %%v1,4(%3),3 \n\t" +#else + "vlef %%v0,0(%3),1 \n\t" + "vlef %%v0,0(%3),3 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v0,0(%3),2 \n\t" + "vlrepf %%v1,4(%3) \n\t" +#endif + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2]; + + if (n <= 0) return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) { + da[0] = da_r; + da[1] = da_i; + caxpy_kernel_16(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + + } + return (0); + + + } + + inc_x *= 2; + inc_y *= 2; + + while (i < n) { + +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * 
x[ix]);
+#endif
+ ix += inc_x;
+ iy += inc_y;
+ i++;
+
+ }
+ return (0);
+
+}
+
+
diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c
new file mode 100644
index 0000000000..fc0b8d6485
--- /dev/null
+++ b/kernel/zarch/ccopy.c
@@ -0,0 +1,99 @@
+/***************************************************************************
+Copyright (c) 2013-2018, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+ __asm__ volatile (
+ "lgr %%r1,%1 \n\t"
+ "lgr %%r2,%2 \n\t"
+ "srlg %%r0,%0,5 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1) \n\t"
+ "pfd 2, 1024(%%r2) \n\t"
+ "mvc 0(256,%%r2),0(%%r1) \n\t"
+ "agfi %%r1,256 \n\t"
+ "agfi %%r2,256 \n\t"
+ "brctg %%r0,0b "
+ :
+ :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
+ :"memory","cc","r0","r1","r2"
+ );
+}
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x == 1) && (inc_y == 1 ))
+ {
+
+ BLASLONG n1 = n & -32;
+ if ( n1 > 0 )
+ {
+ ccopy_kernel_32(n1, x, y);
+ i=n1;
+ ix=n1*2;
+ iy=n1*2;
+ }
+
+ while(i < n)
+ {
+ y[iy] = x[ix] ;
+ y[iy+1] = x[ix+1] ;
+ ix+=2;
+ iy+=2;
+ i++ ;
+
+ }
+
+
+ }
+ else
+ {
+
+ BLASLONG inc_x2 = 2 * inc_x;
+ BLASLONG inc_y2 = 2 * inc_y;
+
+ while(i < n)
+ {
+ y[iy] = x[ix] ;
+ y[iy+1] = x[ix+1] ;
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+
+ }
+
+ return(0);
+}
diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c
new file mode 100644
index 0000000000..3eda2979b9
--- /dev/null
+++ b/kernel/zarch/cdot.c
@@ -0,0 +1,182 @@
+/***************************************************************************
+Copyright (c) 2013-2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v24,%%v24,%%v28 \n\t" + "vfasb %%v24,%%v24,%%v30 \n\t" + "vrepg %%v26,%%v24,1 \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vfasb %%v25,%%v25,%%v29 \n\t" + "vfasb %%v25,%%v25,%%v31 \n\t" + "vrepg %%v27,%%v25,1 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vstef %%v24,0(%3),0 \n\t" + 
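/* result layout: d[0]=sum re(x)*re(y), d[1]=sum im(x)*im(y), d[2]=sum re(x)*im(y), d[3]=sum im(x)*re(y); CNAME combines them according to CONJ */ +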
"vstef %%v24,4(%3),1 \n\t" + "vstef %%v25,8(%3),1 \n\t" + "vstef %%v25,12(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) + cdot_kernel_16(n1, x, y, dot); + + i = n1; + BLASLONG j = i * 2; + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if !defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} + + diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c new file mode 100644 index 0000000000..f04a624ac7 --- /dev/null +++ b/kernel/zarch/crot.c @@ -0,0 +1,256 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* 
yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + crot_kernel_32(n1, x, y, &cosa, &sina); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c new file mode 100644 index 0000000000..0c15c5addb --- /dev/null +++ b/kernel/zarch/cscal.c @@ -0,0 +1,456 @@ +/*************************************************************************** +Copyright (c) 2013 - 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "vlef %%v1,4(%1),0 \n\t" + "vlef %%v1,4(%1),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%1),1 \n\t" + "vlef %%v1,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + "verllg %%v28,%%v20,32 \n\t" + "verllg %%v29,%%v21,32 \n\t" + "verllg %%v30,%%v22,32 \n\t" + "verllg %%v31,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlef %%v0,4(%1),0 \n\t" + "vlef %%v0,4(%1),2 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,4(%1),1 \n\t" + "vlef %%v0,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) 
\n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v16,%%v16,32 \n\t" + "verllg %%v17,%%v17,32 \n\t" + "verllg %%v18,%%v18,32 \n\t" + "verllg %%v19,%%v19,32 \n\t" + "verllg %%v20,%%v20,32 \n\t" + "verllg %%v21,%%v21,32 \n\t" + "verllg %%v22,%%v22,32 \n\t" + "verllg %%v23,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) + { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * 
x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); + + if (inc_x != 1) { + inc_x <<= 1; + + if (da_r == 0.0) { + + BLASLONG n1 = n & -2; + + if (da_i == 0.0) { + + while (j < n1) { + + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; + + } + + } else { + + while (j < n1) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + + + } + + } else { + + + if (da_i == 0.0) { + BLASLONG n1 = n & -2; + + while (j < n1) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } else { + + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } + + } + + return (0); + } + + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + alpha[0] = da_r; + alpha[1] = da_i; + + if (da_r == 0.0) + if (da_i == 0) + cscal_kernel_16_zero(n1, x); + else + cscal_kernel_16_zero_r(n1, alpha, x); + else + if (da_i == 0) + cscal_kernel_16_zero_i(n1, alpha, x); + else + cscal_kernel_16(n1, alpha, x); + + i = n1 << 1; + j = n1; + } + + + if (da_r == 0.0) { + + if (da_i == 0.0) { + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } else { + + if (da_i == 0.0) { + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } + + return (0); +} diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c new file mode 100644 index 0000000000..256995d500 --- /dev/null +++ b/kernel/zarch/cswap.c @@ -0,0 +1,183 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + 
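+ /* no output operands: x and y are swapped in place; the "ZR"
+    whole-array operands and the "memory" clobber below make those
+    stores visible to the compiler */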
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + cswap_kernel_32(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c new file mode 100644 index 0000000000..b74af5d372 --- /dev/null +++ b/kernel/zarch/damax.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
+{
+    FLOAT amax;
+
+    __asm__ volatile (
+        "vl %%v0,0(%2) \n\t"
+        "vflpdb %%v0,%%v0 \n\t"
+        "srlg %%r0,%1,5 \n\t"
+        "xgr %%r1,%%r1 \n\t"
+        "0: \n\t"
+        "pfd 1, 1024(%%r1,%2) \n\t"
+
+        "vl %%v16,0(%%r1,%2) \n\t"
+        "vl %%v17,16(%%r1,%2) \n\t"
+        "vl %%v18,32(%%r1,%2) \n\t"
+        "vl %%v19,48(%%r1,%2) \n\t"
+        "vl %%v20,64(%%r1,%2) \n\t"
+        "vl %%v21,80(%%r1,%2) \n\t"
+        "vl %%v22,96(%%r1,%2) \n\t"
+        "vl %%v23,112(%%r1,%2) \n\t"
+        "vflpdb %%v16, %%v16 \n\t"
+        "vflpdb %%v17, %%v17 \n\t"
+        "vflpdb %%v18, %%v18 \n\t"
+        "vflpdb %%v19, %%v19 \n\t"
+        "vflpdb %%v20, %%v20 \n\t"
+        "vflpdb %%v21, %%v21 \n\t"
+        "vflpdb %%v22, %%v22 \n\t"
+        "vflpdb %%v23, %%v23 \n\t"
+
+        "vfchdb %%v24,%%v16,%%v17 \n\t"
+        "vfchdb %%v25,%%v18,%%v19 \n\t"
+        "vfchdb %%v26,%%v20,%%v21 \n\t"
+        "vfchdb %%v27,%%v22,%%v23 \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"
+
+        "vfchdb %%v28,%%v24,%%v25 \n\t"
+        "vfchdb %%v29,%%v26,%%v27 \n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"
+
+        "vfchdb %%v30,%%v28,%%v29 \n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"
+
+        "vfchdb %%v31,%%v30,%%v0 \n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"
+
+        "vl %%v16,128(%%r1,%2) \n\t"
+        "vl %%v17,144(%%r1,%2) \n\t"
+        "vl %%v18,160(%%r1,%2) \n\t"
+        "vl %%v19,176(%%r1,%2) \n\t"
+        "vl %%v20,192(%%r1,%2) \n\t"
+        "vl %%v21,208(%%r1,%2) \n\t"
+        "vl %%v22,224(%%r1,%2) \n\t"
+        "vl %%v23,240(%%r1,%2) \n\t"
+        "vflpdb %%v16, %%v16 \n\t"
+        "vflpdb %%v17, %%v17 \n\t"
+        "vflpdb %%v18, %%v18 \n\t"
+        "vflpdb %%v19, %%v19 \n\t"
+        "vflpdb %%v20, %%v20 \n\t"
+        "vflpdb %%v21, %%v21 \n\t"
+        "vflpdb %%v22, %%v22 \n\t"
+        "vflpdb %%v23, %%v23 \n\t"
+
+        "vfchdb %%v24,%%v16,%%v17 \n\t"
+        "vfchdb %%v25,%%v18,%%v19 \n\t"
+        "vfchdb %%v26,%%v20,%%v21 \n\t"
+        "vfchdb %%v27,%%v22,%%v23 \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"
+
+        "vfchdb %%v28,%%v24,%%v25 \n\t"
+        "vfchdb %%v29,%%v26,%%v27 \n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"
+
+        "vfchdb %%v30,%%v28,%%v29 \n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"
+
+        "vfchdb %%v31,%%v30,%%v0 \n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"
+
+        "agfi %%r1, 256 \n\t"
+        "brctg %%r0, 0b \n\t"
+
+        "vrepg %%v16,%%v0,1 \n\t"
+        "wfchdb %%v17,%%v16,%%v0 \n\t"
+        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+        "ldr %0,%%f0 "
+        :"=f"(amax)
+        :"r"(n),"ZR"((const FLOAT (*)[n])x)
+        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
+    );
+
+    return amax;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+    BLASLONG i = 0;
+    BLASLONG j = 0;
+    FLOAT maxf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return (maxf);
+
+    if (inc_x == 1) {
+
+        BLASLONG n1 = n & -32;
+        if (n1 > 0) {
+
+            maxf = damax_kernel_32(n1, x);
+
+            i = n1;
+        }
+        else
+        {
+            maxf = ABS(x[0]);
+            i++;
+        }
+
+        while (i < n) {
+            if (ABS(x[i]) > maxf) {
+                maxf = ABS(x[i]);
+            }
+            i++;
+        }
+        return (maxf);
+
+    } else {
+
+        maxf = ABS(x[0]);
+        i += inc_x;
+        j++;
+
+        BLASLONG n1 = (n - 1) & -4;
+        while (j < n1) {
+
+            if (ABS(x[i]) > maxf) {
+                maxf = ABS(x[i]);
+            }
+            if (ABS(x[i + inc_x]) > maxf) {
+                maxf = ABS(x[i + inc_x]);
+            }
+            if
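+            /* the remaining two strided elements of this unrolled step */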
(ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c new file mode 100644 index 0000000000..4cf5e88b13 --- /dev/null +++ b/kernel/zarch/damin.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
+{
+    FLOAT amin;
+
+    __asm__ volatile (
+        "vl %%v0,0(%2) \n\t"
+        "vflpdb %%v0,%%v0 \n\t"
+        "srlg %%r0,%1,5 \n\t"
+        "xgr %%r1,%%r1 \n\t"
+        "0: \n\t"
+        "pfd 1, 1024(%%r1,%2) \n\t"
+
+        "vl %%v16,0(%%r1,%2) \n\t"
+        "vl %%v17,16(%%r1,%2) \n\t"
+        "vl %%v18,32(%%r1,%2) \n\t"
+        "vl %%v19,48(%%r1,%2) \n\t"
+        "vl %%v20,64(%%r1,%2) \n\t"
+        "vl %%v21,80(%%r1,%2) \n\t"
+        "vl %%v22,96(%%r1,%2) \n\t"
+        "vl %%v23,112(%%r1,%2) \n\t"
+        "vflpdb %%v16, %%v16 \n\t"
+        "vflpdb %%v17, %%v17 \n\t"
+        "vflpdb %%v18, %%v18 \n\t"
+        "vflpdb %%v19, %%v19 \n\t"
+        "vflpdb %%v20, %%v20 \n\t"
+        "vflpdb %%v21, %%v21 \n\t"
+        "vflpdb %%v22, %%v22 \n\t"
+        "vflpdb %%v23, %%v23 \n\t"
+
+        "vfchdb %%v24,%%v17,%%v16 \n\t"
+        "vfchdb %%v25,%%v19,%%v18 \n\t"
+        "vfchdb %%v26,%%v21,%%v20 \n\t"
+        "vfchdb %%v27,%%v23,%%v22 \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"
+
+        "vfchdb %%v28,%%v25,%%v24 \n\t"
+        "vfchdb %%v29,%%v27,%%v26 \n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"
+
+        "vfchdb %%v30,%%v29,%%v28 \n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"
+
+        "vfchdb %%v31,%%v0,%%v30 \n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"
+
+        "vl %%v16,128(%%r1,%2) \n\t"
+        "vl %%v17,144(%%r1,%2) \n\t"
+        "vl %%v18,160(%%r1,%2) \n\t"
+        "vl %%v19,176(%%r1,%2) \n\t"
+        "vl %%v20,192(%%r1,%2) \n\t"
+        "vl %%v21,208(%%r1,%2) \n\t"
+        "vl %%v22,224(%%r1,%2) \n\t"
+        "vl %%v23,240(%%r1,%2) \n\t"
+        "vflpdb %%v16, %%v16 \n\t"
+        "vflpdb %%v17, %%v17 \n\t"
+        "vflpdb %%v18, %%v18 \n\t"
+        "vflpdb %%v19, %%v19 \n\t"
+        "vflpdb %%v20, %%v20 \n\t"
+        "vflpdb %%v21, %%v21 \n\t"
+        "vflpdb %%v22, %%v22 \n\t"
+        "vflpdb %%v23, %%v23 \n\t"
+
+        "vfchdb %%v24,%%v17,%%v16 \n\t"
+        "vfchdb %%v25,%%v19,%%v18 \n\t"
+        "vfchdb %%v26,%%v21,%%v20 \n\t"
+        "vfchdb %%v27,%%v23,%%v22 \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+        "vsel %%v26,%%v20,%%v21,%%v26 \n\t"
+        "vsel %%v27,%%v22,%%v23,%%v27 \n\t"
+
+        "vfchdb %%v28,%%v25,%%v24 \n\t"
+        "vfchdb %%v29,%%v27,%%v26 \n\t"
+        "vsel %%v28,%%v24,%%v25,%%v28 \n\t"
+        "vsel %%v29,%%v26,%%v27,%%v29 \n\t"
+
+        "vfchdb %%v30,%%v29,%%v28 \n\t"
+        "vsel %%v30,%%v28,%%v29,%%v30 \n\t"
+
+        "vfchdb %%v31,%%v0,%%v30 \n\t"
+        "vsel %%v0,%%v30,%%v0,%%v31 \n\t"
+
+        "agfi %%r1, 256 \n\t"
+        "brctg %%r0, 0b \n\t"
+
+        "vrepg %%v16,%%v0,1 \n\t"
+        "wfchdb %%v17,%%v0,%%v16 \n\t"
+        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+        "ldr %0,%%f0 "
+        :"=f"(amin)
+        :"r"(n),"ZR"((const FLOAT (*)[n])x)
+        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
+    );
+
+    return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+    BLASLONG i = 0;
+    BLASLONG j = 0;
+    FLOAT minf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return (minf);
+
+    if (inc_x == 1) {
+
+        BLASLONG n1 = n & -32;
+        if (n1 > 0) {
+
+            minf = damin_kernel_32(n1, x);
+
+            i = n1;
+        }
+        else
+        {
+            minf = ABS(x[0]);
+            i++;
+        }
+
+        while (i < n) {
+            if (ABS(x[i]) < minf) {
+                minf = ABS(x[i]);
+            }
+            i++;
+        }
+        return (minf);
+
+    } else {
+
+        minf = ABS(x[0]);
+        i += inc_x;
+        j++;
+
+        BLASLONG n1 = (n - 1) & -4;
+        while (j < n1) {
+
+            if (ABS(x[i]) < minf) {
+                minf = ABS(x[i]);
+            }
+            if (ABS(x[i + inc_x]) < minf) {
+                minf = ABS(x[i + inc_x]);
+            }
+            if
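+            /* likewise, the last two strided elements of the unrolled step */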
(ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 7a42a08634..fea431c34f 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" #include @@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ABS fabsf #endif - - - -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT asum ; - __asm__ ( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_temp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %[ptr_temp],256(%[ptr_temp]) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "clgrjl %[ptr_temp],%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v2,%%v3 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %[asum],%%f0 \n\t" - : [asum] "=f"(asum),[ptr_temp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x) - : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return asum; - +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl 
%%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; } - - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 16f82a5879..e8823745e4 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" -#define PREFETCH_INS 1 -#if defined(Z13_A) -#include - -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) -{ - BLASLONG i = 0; - __vector double v_a = {alpha,alpha}; - __vector double * v_y=(__vector double *)y; - __vector double * v_x=(__vector double *)x; - - for(; i -#endif - -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double v_x2 = {x2,x2}; - __vector double v_x3 = {x3,x3}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; - } -} - -#else - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - for ( i=0; i<4; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; - } -} - - -#endif - -#ifdef HAVE_KERNEL_4x2 - -#elif HAVE_KERNEL_4x2_VEC - -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } + __asm__ volatile ( + "vlrepg %%v0,0(%5) \n\t" + "vlrepg %%v1,8(%5) \n\t" + "vlrepg %%v2,16(%5) \n\t" + "vlrepg %%v3,24(%5) \n\t" + "vlrepg %%v4,%7 \n\t" + "vfmdb %%v0,%%v0,%%v4 \n\t" + "vfmdb %%v1,%%v1,%%v4 \n\t" + "vfmdb %%v2,%%v2,%%v4 \n\t" + "vfmdb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 
\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "vl %%v4,32(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" + + "vl %%v4,48(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + + "vl %%v4,64(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" + + "vl %%v4,80(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" + + "vl %%v4,96(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" + + "vl %%v4,112(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0,*a1; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 
= ap[1]; - - for ( i=0; i<2; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; - } -} - - -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0; - x0 = xo[0] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } + __asm__ volatile ( + "vlrepg %%v0,0(%3) \n\t" + "vlrepg %%v1,8(%3) \n\t" + "vlrepg %%v2,%5 \n\t" + "vfmdb %%v0,%%v0,%%v2 \n\t" + "vfmdb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmadb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmadb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmadb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmadb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmadb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" - + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + 
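+ /* the clobber list that follows names every general and vector
+    register the kernel writes, so the compiler can spill around it */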
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap; - - for ( i=0; i<1; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } + __asm__ volatile ( + "vlrepg %%v0,0(%2) \n\t" + "vlrepg %%v1,%4 \n\t" + "vfmdb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmadb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmadb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - -#endif - - - static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; - - for ( i=0; i -#endif #define NBMAX 2048 -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - __vector double temp2 = {0,0}; - __vector double temp3 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ; - temp3 += 
v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; - y[2] = temp2[0] + temp2[1]; - y[3] = temp3[0] + temp3[1];; -} -#else -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; - temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - y[2] = temp2; - y[3] = temp3; + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + 
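+ /* v16-v23 hold this 128-byte block of x; v0-v3 carry the running
+    dot products for columns 0-3 */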
"vfmadb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v4,%%v0,1 \n\t" + "adbr %%f0,%%f4 \n\t" + "std %%f0,0(%6) \n\t" + "vrepg %%v4,%%v1,1 \n\t" + "adbr %%f1,%%f4 \n\t" + "std %%f1,8(%6) \n\t" + "vrepg %%v4,%%v2,1 \n\t" + "adbr %%f2,%%f4 \n\t" + "std %%f2,16(%6) \n\t" + "vrepg %%v4,%%v3,1 \n\t" + "adbr %%f3,%%f4 \n\t" + "std %%f3,24(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - -#endif - -#ifdef HAVE_KERNEL_4x2 - -#elif HAVE_KERNEL_4x2_VEC static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; -} -#else -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" 
+ "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v2,%%v0,1 \n\t" + "adbr %%f0,%%f2 \n\t" + "std %%f0,0(%4) \n\t" + "vrepg %%v2,%%v1,1 \n\t" + "adbr %%f1,%%f2 \n\t" + "std %%f1,8(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)a0; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg 
%%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "std %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; - - - FLOAT temp0 = 0.0; - - for ( i=0; i< n; i+=4 ) + for (i = 0; i < n; i++) { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + dest[i] = *src; + src += inc_src; } - y[0] = temp0; } -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { - BLASLONG i; - for ( i=0; i 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c new file mode 100644 index 0000000000..d7c86735f7 --- /dev/null +++ b/kernel/zarch/dmin.c @@ -0,0 +1,182 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index bf29538c7a..c91f958005 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) 
\n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc", "r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 
2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb 
%%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -32; if ( n1 > 0 ) { - - drot_kernel_32(n1, x, y, c, s); + FLOAT cosa,sina; + cosa=c; + sina=s; + drot_kernel_32(n1, x, y, &cosa, &sina); i=n1; } @@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index e29f51012c..ccc6dd95d2 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#ifdef Z13_A -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) -{ - - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "srlg %[n],%[n],4 \n\t" - "vlr %%v1,%%v0 \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "la %[x_ptr], 128(%[x_ptr]) \n\t" - "aghik %[n], %[n], -1 \n\t" - "jle 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v0 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "vlm %%v16,%%v19, 0(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vlm %%v20,%%v23, 64(%[x_ptr]) \n\t" - "lay %[x_ptr], -128(%[x_ptr]) \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "brctg %[n],1b \n\t" - "2: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v1 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "lay %[x_ptr] , -128(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n) - : [alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - } -#else -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__ volatile ( + "vlrepg %%v0,%1 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} - /* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "vlr %%v1,%%v0 \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v1 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v1 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v1 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v1 \n\t" - "vstm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vlm %%v24,%%v31,128(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vfmdb %%v25,%%v25,%%v1 \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vfmdb %%v27,%%v27,%%v1 \n\t" - "vfmdb %%v28,%%v28,%%v0 \n\t" - "vfmdb %%v29,%%v29,%%v1 \n\t" - "vfmdb %%v30,%%v30,%%v0 \n\t" 
- "vfmdb %%v31,%%v31,%%v1 \n\t" - "vstm %%v24,%%v31,128(%[x_ptr]) \n\t" - "la %[x_ptr], 256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - } -#endif -static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "sllg %%r0,%[n],3 \n\t" - "vzero %%v25 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vst %%v24, 32(%[x_ptr]) \n\t" - "vst %%v25, 48(%[x_ptr]) \n\t" - "vst %%v24, 64(%[x_ptr]) \n\t" - "vst %%v25, 80(%[x_ptr]) \n\t" - "vst %%v24, 96(%[x_ptr]) \n\t" - "vst %%v25, 112(%[x_ptr]) \n\t" - "vst %%v24, 128(%[x_ptr]) \n\t" - "vst %%v25, 144(%[x_ptr]) \n\t" - "vst %%v24, 160(%[x_ptr]) \n\t" - "vst %%v25, 176(%[x_ptr]) \n\t" - "vst %%v24, 192(%[x_ptr]) \n\t" - "vst %%v25, 208(%[x_ptr]) \n\t" - "vst %%v24, 224(%[x_ptr]) \n\t" - "vst %%v25, 240(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" , "r0", "v24" ,"v25" - ); + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; @@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32_zero(n1 , x); + dscal_kernel_16_zero(n1, x); j=n1; } @@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32(n1 , da , x); + dscal_kernel_16(n1, da, x); j=n1; } while(j < n) @@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } return 0; -} \ No newline at end of file +} + + diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c new file mode 100644 index 0000000000..17461a0290 --- /dev/null +++ b/kernel/zarch/dsdot.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + double dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmsb %%v16,%%v16,%%v24 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmsb %%v17,%%v17,%%v25 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmsb %%v18,%%v18,%%v26 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmsb %%v19,%%v19,%%v27 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmsb %%v20,%%v20,%%v28 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmsb %%v21,%%v21,%%v29 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmsb %%v22,%%v22,%%v30 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmsb %%v23,%%v23,%%v31 \n\t" + + "vflls %%v24,%%v16 \n\t" + "vflls %%v25,%%v17 \n\t" + "vflls %%v26,%%v18 \n\t" + "vflls %%v27,%%v19 \n\t" + "vflls %%v28,%%v20 \n\t" + "vflls %%v29,%%v21 \n\t" + "vflls %%v30,%%v22 \n\t" + "vflls %%v31,%%v23 \n\t" + + "veslg %%v16,%%v16,32 \n\t" + "veslg %%v17,%%v17,32 \n\t" + "veslg %%v18,%%v18,32 \n\t" + "veslg %%v19,%%v19,32 \n\t" + "veslg %%v20,%%v20,32 \n\t" + "veslg %%v21,%%v21,32 \n\t" + "veslg %%v22,%%v22,32 \n\t" + "veslg %%v23,%%v23,32 \n\t" + + "vflls %%v16,%%v16 \n\t" + "vflls %%v17,%%v17 \n\t" + "vflls %%v18,%%v18 \n\t" + "vflls %%v19,%%v19 \n\t" + "vflls %%v20,%%v20 \n\t" + "vflls %%v21,%%v21 \n\t" + "vflls %%v22,%%v22 \n\t" + "vflls %%v23,%%v23 \n\t" + + "vfadb %%v16,%%v16,%%v24 \n\t" + "vfadb %%v17,%%v17,%%v25 \n\t" + "vfadb %%v18,%%v18,%%v26 \n\t" + "vfadb %%v19,%%v19,%%v27 \n\t" + "vfadb %%v20,%%v20,%%v28 \n\t" + "vfadb %%v21,%%v21,%%v29 \n\t" + "vfadb %%v22,%%v22,%%v30 \n\t" + "vfadb %%v23,%%v23,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v20 \n\t" + "vfadb %%v17,%%v17,%%v21 \n\t" + "vfadb %%v18,%%v18,%%v22 \n\t" + "vfadb %%v19,%%v19,%%v23 \n\t" + "vfadb %%v16,%%v16,%%v18 \n\t" + "vfadb %%v17,%%v17,%%v19 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v0,%%v16,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr 
%0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + double dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = dsdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index d7e079147e..8070ef41ac 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - - #include "common.h" - - -#if defined(Z13_SWAP_A) -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 
160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#else - -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - 
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; @@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, } - - diff --git 
a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c new file mode 100644 index 0000000000..e7f096e0d4 --- /dev/null +++ b/kernel/zarch/icamax.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v0,8(%3),1 \n\t" + "vlef %%v1,12(%3),1 \n\t" + "vlef %%v0,16(%3),2 \n\t" + "vlef %%v1,20(%3),2 \n\t" + "vlef %%v0,24(%3),3 \n\t" + "vlef %%v1,28(%3),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v1,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,16 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vlef %%v16,0(%%r1,%3),0 \n\t" + "vlef %%v17,4(%%r1,%3),0 \n\t" + "vlef %%v16,8(%%r1,%3),1 \n\t" + "vlef %%v17,12(%%r1,%3),1 \n\t" + "vlef %%v16,16(%%r1,%3),2 \n\t" + "vlef %%v17,20(%%r1,%3),2 \n\t" + "vlef %%v16,24(%%r1,%3),3 \n\t" + "vlef %%v17,28(%%r1,%3),3 \n\t" + + "vlef %%v18,32(%%r1,%3),0 \n\t" + "vlef %%v19,36(%%r1,%3),0 \n\t" + "vlef %%v18,40(%%r1,%3),1 \n\t" + "vlef %%v19,44(%%r1,%3),1 \n\t" + "vlef %%v18,48(%%r1,%3),2 \n\t" + "vlef %%v19,52(%%r1,%3),2 \n\t" + "vlef %%v18,56(%%r1,%3),3 \n\t" + "vlef 
%%v19,60(%%r1,%3),3 \n\t" + + "vlef %%v20,64(%%r1,%3),0 \n\t" + "vlef %%v21,68(%%r1,%3),0 \n\t" + "vlef %%v20,72(%%r1,%3),1 \n\t" + "vlef %%v21,76(%%r1,%3),1 \n\t" + "vlef %%v20,80(%%r1,%3),2 \n\t" + "vlef %%v21,84(%%r1,%3),2 \n\t" + "vlef %%v20,88(%%r1,%3),3 \n\t" + "vlef %%v21,92(%%r1,%3),3 \n\t" + + "vlef %%v22,96(%%r1,%3),0 \n\t" + "vlef %%v23,100(%%r1,%3),0 \n\t" + "vlef %%v22,104(%%r1,%3),1 \n\t" + "vlef %%v23,108(%%r1,%3),1 \n\t" + "vlef %%v22,112(%%r1,%3),2 \n\t" + "vlef %%v23,116(%%r1,%3),2 \n\t" + "vlef %%v22,120(%%r1,%3),3 \n\t" + "vlef %%v23,124(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vlef %%v16,128(%%r1,%3),0 \n\t" + "vlef %%v17,132(%%r1,%3),0 \n\t" + "vlef %%v16,136(%%r1,%3),1 \n\t" + "vlef %%v17,140(%%r1,%3),1 \n\t" + "vlef %%v16,144(%%r1,%3),2 \n\t" + "vlef %%v17,148(%%r1,%3),2 \n\t" + "vlef %%v16,152(%%r1,%3),3 \n\t" + "vlef %%v17,156(%%r1,%3),3 \n\t" + + "vlef %%v18,160(%%r1,%3),0 \n\t" + "vlef %%v19,164(%%r1,%3),0 \n\t" + "vlef %%v18,168(%%r1,%3),1 \n\t" + "vlef %%v19,172(%%r1,%3),1 \n\t" + "vlef %%v18,176(%%r1,%3),2 \n\t" + "vlef %%v19,180(%%r1,%3),2 \n\t" + "vlef %%v18,184(%%r1,%3),3 \n\t" + "vlef %%v19,188(%%r1,%3),3 \n\t" + + "vlef %%v20,192(%%r1,%3),0 \n\t" + "vlef %%v21,196(%%r1,%3),0 \n\t" + "vlef %%v20,200(%%r1,%3),1 \n\t" + "vlef %%v21,204(%%r1,%3),1 \n\t" + "vlef %%v20,208(%%r1,%3),2 \n\t" + "vlef %%v21,212(%%r1,%3),2 \n\t" + "vlef %%v20,216(%%r1,%3),3 \n\t" + "vlef %%v21,220(%%r1,%3),3 \n\t" + + "vlef %%v22,224(%%r1,%3),0 \n\t" + "vlef %%v23,228(%%r1,%3),0 \n\t" + "vlef %%v22,232(%%r1,%3),1 \n\t" + "vlef %%v23,236(%%r1,%3),1 \n\t" + "vlef %%v22,240(%%r1,%3),2 \n\t" + "vlef %%v23,244(%%r1,%3),2 \n\t" + "vlef %%v22,248(%%r1,%3),3 \n\t" + "vlef %%v23,252(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + 
"vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = icamax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } +} + + diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c new file mode 100644 index 0000000000..b9c1ccd9c9 --- /dev/null +++ b/kernel/zarch/icamin.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v0,8(%3),1 \n\t" + "vlef %%v1,12(%3),1 \n\t" + "vlef %%v0,16(%3),2 \n\t" + "vlef %%v1,20(%3),2 \n\t" + "vlef %%v0,24(%3),3 \n\t" + "vlef %%v1,28(%3),3 \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vflpsb %%v1,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,16 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vlef %%v16,0(%%r1,%3),0 \n\t" + "vlef %%v17,4(%%r1,%3),0 \n\t" + "vlef %%v16,8(%%r1,%3),1 \n\t" + "vlef %%v17,12(%%r1,%3),1 \n\t" + "vlef %%v16,16(%%r1,%3),2 \n\t" + "vlef %%v17,20(%%r1,%3),2 \n\t" + "vlef %%v16,24(%%r1,%3),3 \n\t" + "vlef %%v17,28(%%r1,%3),3 \n\t" + + "vlef %%v18,32(%%r1,%3),0 \n\t" + "vlef %%v19,36(%%r1,%3),0 \n\t" + "vlef %%v18,40(%%r1,%3),1 \n\t" + "vlef %%v19,44(%%r1,%3),1 \n\t" + "vlef %%v18,48(%%r1,%3),2 \n\t" + "vlef %%v19,52(%%r1,%3),2 \n\t" + "vlef %%v18,56(%%r1,%3),3 \n\t" + "vlef %%v19,60(%%r1,%3),3 \n\t" + + "vlef %%v20,64(%%r1,%3),0 \n\t" + "vlef %%v21,68(%%r1,%3),0 \n\t" + "vlef %%v20,72(%%r1,%3),1 \n\t" + "vlef %%v21,76(%%r1,%3),1 \n\t" + "vlef %%v20,80(%%r1,%3),2 \n\t" + "vlef %%v21,84(%%r1,%3),2 \n\t" + "vlef %%v20,88(%%r1,%3),3 \n\t" + "vlef %%v21,92(%%r1,%3),3 \n\t" + + "vlef %%v22,96(%%r1,%3),0 \n\t" + "vlef %%v23,100(%%r1,%3),0 \n\t" + "vlef %%v22,104(%%r1,%3),1 \n\t" + "vlef %%v23,108(%%r1,%3),1 \n\t" + "vlef %%v22,112(%%r1,%3),2 \n\t" + "vlef %%v23,116(%%r1,%3),2 \n\t" + "vlef %%v22,120(%%r1,%3),3 \n\t" + "vlef %%v23,124(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel 
%%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vlef %%v16,128(%%r1,%3),0 \n\t" + "vlef %%v17,132(%%r1,%3),0 \n\t" + "vlef %%v16,136(%%r1,%3),1 \n\t" + "vlef %%v17,140(%%r1,%3),1 \n\t" + "vlef %%v16,144(%%r1,%3),2 \n\t" + "vlef %%v17,148(%%r1,%3),2 \n\t" + "vlef %%v16,152(%%r1,%3),3 \n\t" + "vlef %%v17,156(%%r1,%3),3 \n\t" + + "vlef %%v18,160(%%r1,%3),0 \n\t" + "vlef %%v19,164(%%r1,%3),0 \n\t" + "vlef %%v18,168(%%r1,%3),1 \n\t" + "vlef %%v19,172(%%r1,%3),1 \n\t" + "vlef %%v18,176(%%r1,%3),2 \n\t" + "vlef %%v19,180(%%r1,%3),2 \n\t" + "vlef %%v18,184(%%r1,%3),3 \n\t" + "vlef %%v19,188(%%r1,%3),3 \n\t" + + "vlef %%v20,192(%%r1,%3),0 \n\t" + "vlef %%v21,196(%%r1,%3),0 \n\t" + "vlef %%v20,200(%%r1,%3),1 \n\t" + "vlef %%v21,204(%%r1,%3),1 \n\t" + "vlef %%v20,208(%%r1,%3),2 \n\t" + "vlef %%v21,212(%%r1,%3),2 \n\t" + "vlef %%v20,216(%%r1,%3),3 \n\t" + "vlef %%v21,220(%%r1,%3),3 \n\t" + + "vlef %%v22,224(%%r1,%3),0 \n\t" + "vlef %%v23,228(%%r1,%3),0 \n\t" + "vlef %%v22,232(%%r1,%3),1 \n\t" + "vlef %%v23,236(%%r1,%3),1 \n\t" + "vlef %%v22,240(%%r1,%3),2 \n\t" + "vlef %%v23,244(%%r1,%3),2 \n\t" + "vlef %%v22,248(%%r1,%3),3 \n\t" + "vlef %%v23,252(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; +} + 
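+/* Summary of icamin_kernel_32 above: %%v0 carries the four running minima of |re|+|im| and %%v1/%%v2 carry the matching element indices as 64-bit lanes. Each loop pass folds 32 complex elements down to four candidates; vsel keeps the earlier value on ties, so the first occurrence of the minimum wins, and the final reduction prefers the lower index via vmnlg when values compare equal, as BLAS ICAMIN requires. CNAME below handles the n%32 tail and strided inputs. */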
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = icamin_kernel_32(n1, x, &minf); + ix = n1 * 2; /* skip the 2*n1 floats already covered by the vector kernel */ + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } +} + + diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b670911480..aba880949f 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include <math.h> #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vzero %%v5 \n\t" - "vzero %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 
\n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v26,%%v18 \n\t" - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[maxf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return index; +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; -} + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel 
%%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; +} BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG max = 0; @@ -191,7 +200,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; if (n1 > 0) { - max = diamax_kernel_32_TUNED(n1, x, &maxf); + max = idamax_kernel_32(n1, x, &maxf); i = n1; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 8a7ff1659a..3213efa4da 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -23,192 +23,185 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
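
For reference, the contract of the rewritten idamax_kernel_32 above can be sketched in plain C. This is an illustrative model only, not part of the patch: it assumes, as the CNAME wrapper guarantees, that n > 0 and n is a multiple of 32 (n1 = n & -32 rounds n down by clearing the five low bits), and it mirrors the kernel's tie rule of keeping the lowest index when two elements share the largest magnitude.

#include <math.h>

/* idamax_ref: hypothetical scalar model of idamax_kernel_32.
 * Returns the 0-based index of the first element with the largest
 * |x[i]| and stores that magnitude in *amax. The vector kernel
 * takes |x| with vflpdb and compares with vfchdb (strictly
 * greater-than), which is what the strict '>' below corresponds to. */
static long idamax_ref(long n, const double *x, double *amax)
{
    long imax = 0;
    double maxf = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i]);
        if (v > maxf) {      /* strict: first occurrence wins ties */
            maxf = v;
            imax = i;
        }
    }
    *amax = maxf;
    return imax;
}

The wrapper then scans the remaining n % 32 tail elements with the same rule and returns max + 1, since BLAS index results are 1-based.
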
- *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vlrepg %%v18,0(%[ptr_x]) \n\t" - "vzero %%v5 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfchdb %%v16,%%v24,%%v25 \n\t " - "vfchdb %%v17,%%v26 ,%%v27 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28, %%v29 \n\t " - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfchdb %%v16,%%v24,%%v25 \n\t" - "vfchdb %%v17,%%v26 ,%%v27 \n\t" - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28 ,%%v29 \n\t" - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel 
%%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v18 ,%%v26 \n\t " - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[minf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; - +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + 
"vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamin; } - - - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; - BLASLONG min = 0; FLOAT minf = 0.0; - + BLASLONG min = 0; + if (n <= 0 || inc_x <= 0) return (min); - minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant + if (inc_x == 1) { BLASLONG n1 = n & -32; if (n1 > 0) { - min = diamin_kernel_32(n1, x, &minf); + min = idamin_kernel_32(n1, x, &minf); + i = n1; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c new file mode 100644 index 0000000000..26fff4eb03 --- /dev/null +++ b/kernel/zarch/idmax.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 
\n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = idmax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c new file mode 100644 index 0000000000..570b33a151 --- /dev/null +++ b/kernel/zarch/idmin.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
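
The inc_x != 1 fallback used by these new CNAME wrappers is a plain scalar loop unrolled by four. A self-contained sketch of the same pattern for the max case (hypothetical helper, assumptions noted in the comments):

/* imax_strided_ref: hypothetical model of the strided fallback in
 * the new idmax/idmin wrappers. j counts logical elements while i
 * walks the buffer in steps of inc_x; n & -4 bounds the unrolled
 * part. Note: this sketch seeds maxf from x[0] so an all-negative
 * vector is handled; the wrappers in this patch start from 0.0. */
static long imax_strided_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0)
        return 0;
    long i = 0, j = 0, max = 0;
    double maxf = x[0];
    long n1 = n & -4;
    while (j < n1) {
        if (x[i]             > maxf) { maxf = x[i];             max = j;     }
        if (x[i +     inc_x] > maxf) { maxf = x[i +     inc_x]; max = j + 1; }
        if (x[i + 2 * inc_x] > maxf) { maxf = x[i + 2 * inc_x]; max = j + 2; }
        if (x[i + 3 * inc_x] > maxf) { maxf = x[i + 3 * inc_x]; max = j + 3; }
        i += 4 * inc_x;
        j += 4;
    }
    while (j < n) {
        if (x[i] > maxf) { maxf = x[i]; max = j; }
        i += inc_x;
        j++;
    }
    return max + 1;   /* BLAS convention: 1-based result */
}
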
+*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop 
" + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = idmin_kernel_32(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c new file mode 100644 index 0000000000..95a665b10f --- /dev/null +++ b/kernel/zarch/isamax.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb 
%%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = isamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c new file mode 100644 index 0000000000..640fc02c92 --- /dev/null +++ b/kernel/zarch/isamin.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel 
%%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = 
isamin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c new file mode 100644 index 0000000000..0eb3503155 --- /dev/null +++ b/kernel/zarch/ismax.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
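
The single-precision kernels (isamax/isamin above and the ismax/ismin pair that follows) compare 32-bit float lanes but keep their running indices in 64-bit lanes split across two vectors (%%v1/%%v2), which is why they process 64 elements per iteration (n & -64) where the double kernels process 32. Each fullword vfchsb mask therefore has to be widened before it can drive a vsel on the index vectors; that is the role of the vsegf/vesrlg pairs. Reduced to a single lane, the widening is just a sign extension, illustrated by this hypothetical helper (not in the patch):

#include <stdint.h>

/* widen_lane_mask: a 32-bit SIMD compare mask is all-ones or
 * all-zeros per lane; sign-extending it to 64 bits (the effect of
 * vsegf on the low fullword of a doubleword) yields a mask that can
 * select between 64-bit index lanes. */
static uint64_t widen_lane_mask(uint32_t lane_mask)
{
    return (uint64_t)(int64_t)(int32_t)lane_mask;  /* 0 -> 0, ~0u -> ~0ull */
}
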
+*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb 
%%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = ismax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c new file mode 100644 index 0000000000..f050db8cb0 --- /dev/null +++ b/kernel/zarch/ismin.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
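
After the main loop, the float kernels still hold candidates in both fullword lanes of each doubleword, so one more in-register merge runs before scalar extraction: the veslg/vfchsb/vchlg/vfcesb/vn/vo sequence pairs each value compare with an index compare so the lowest index survives a tie. In scalar terms (illustrative sketch for the max case, names hypothetical):

#include <stdint.h>

struct cand { float val; uint64_t idx; };

/* merge_max: keep candidate a unless b is strictly larger, or equal
 * with a smaller index -- the vfcesb (equal) mask ANDed with the
 * vchlg (unsigned index compare) mask enforces exactly this
 * lowest-index-on-tie rule in the assembly. */
static struct cand merge_max(struct cand a, struct cand b)
{
    if (b.val > a.val || (b.val == a.val && b.idx < a.idx))
        return b;
    return a;
}

The closing wfcsb/jne/vmnlg branch appears to resolve the final two candidates by the same rule: on equal values it takes the minimum index, otherwise the index of the larger value.
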
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) 
\n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = ismin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 216c3414a6..bf5f621a7b 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
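
The complex kernel below ranks elements by the BLAS "cabs1" measure, |Re| + |Im| (the CABS1 macro that follows), rather than the true modulus sqrt(re*re + im*im): it is cheaper and order-preserving enough for index selection. A scalar sketch of the measure over an interleaved re/im array (hypothetical helper, mirroring the macro):

#include <math.h>

/* cabs1_ref: |real| + |imaginary| for the complex element whose real
 * part sits at x[i] and imaginary part at x[i + 1] (interleaved
 * storage, so complex element k lives at i = 2 * k). */
static double cabs1_ref(const double *x, long i)
{
    return fabs(x[i]) + fabs(x[i + 1]);
}

izamax then behaves like idamax over these sums, with the strided branch stepping by 2 * inc_x as shown in its wrapper.
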
*****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - - - -/** - * Find maximum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - - - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v6 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, 
%%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 , 128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 , 232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 , 240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 , 248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfchdb %%v25,%%v1,%%v0 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v3,%%v2 \n\t " - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v26,%%v24 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v30,%%v28 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24, %%v1,%%v31 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30, %%v27,%%v3 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0, %%v31,%%v28 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30, %%v27,%%v6 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3 \n\t" - "2: \n\t" - "wfchdb %%v16,%%v26,%%v6 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[maxf] \n\t" - "3: \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - return index; - + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vleg 
%%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -223,9 +198,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG n1 = n & -16; if (n1 > 0) { - max = ziamax_kernel_16_TUNED(n1, x, &maxf); + max = izamax_kernel_16(n1, x, &maxf); + + i = n1; - ix = n1 << 1; + ix = n1 << 1; } while(i < n) @@ -260,7 +235,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return (max + 1); } - } diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 9b2a653a77..3636e8fdf5 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -24,253 +24,217 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
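+
+The rewritten kernel that follows is the min-index counterpart of
+izamax. A minimal scalar sketch of what it computes (illustrative
+only; CABS1 is defined after this header):
+
+    static BLASLONG izamin_ref(BLASLONG n, FLOAT *x) {
+        BLASLONG imin = 0;
+        FLOAT amin = CABS1(x, 0);
+        for (BLASLONG i = 1; i < n; i++) {
+            if (CABS1(x, i * 2) < amin) {  // strict '<' keeps the lowest index on ties
+                amin = CABS1(x, i * 2);
+                imin = i;
+            }
+        }
+        return imin;
+    }
+
+In the vector version candidate values travel in one register and their
+element indices in another: each vfchdb mask is applied to both through
+vsel so value and index stay paired, and the final vmnlg picks the
+lower index when the two surviving lanes compare equal.
+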
*****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - -/** - * Find minimum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index ; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - "ld %%f6,0(%[ptr_x]) \n\t" - "lpdbr %%f6,%%f6 \n\t" - "ld %%f7,8(%[ptr_x]) \n\t" - "lpdbr %%f7,%%f7 \n\t" - "adbr %%f6,%%f7 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vrepg %%v6,%%v6,0 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 
,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 ,128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 ,136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 ,144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 ,152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 ,160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 ,168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 ,176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 ,184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 ,192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 ,200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 ,208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 ,216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 ,224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - - "vfchdb %%v25,%%v0 ,%%v1 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v2,%%v3 \n\t" - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v24,%%v26 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v28,%%v30 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24,%%v31, %%v1 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30,%%v3, %%v27 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0,%%v28, %%v31 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30,%%v6 , %%v27 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3f \n\t" - "2: \n\t" - "wfchdb %%v16,%%v6 ,%%v26 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[minf] \n\t" - "3: \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb 
%%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vleg %%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf; - BLASLONG min=0; + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(min); - - + if (inc_x == 1) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + BLASLONG n1 = n & -16; + if (n1 > 0) { + + min = izamin_kernel_16(n1, x, &minf); - min = ziamin_kernel_16_TUNED(n1, x, &minf); i = n1; - ix = n1 << 1; + ix = n1 << 1; - } - else { - //assign minf - minf = CABS1(x,0); - ix += 2; - i++; - } + } + else { + minf = CABS1(x,0); + ix += 2; + i++; + } - while(i < n) + while(i < n) + { + if( CABS1(x,ix) < minf ) { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; + min = i; + minf = CABS1(x,ix); } + ix += 2; + i++; + } return (min + 1); } else { - inc_x2 = 2 * inc_x; + inc_x2 = 2 * inc_x; - minf = CABS1(x,0); - ix += inc_x2; - i++; + minf = CABS1(x,0); + ix += inc_x2; + i++; - while(i < n) + while(i < n) + { + if( CABS1(x,ix) < minf ) { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; + min = i; + minf = CABS1(x,ix); } + ix += inc_x2; + i++; + } return (min + 1); } - } diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c new file mode 100644 index 0000000000..1025cfcbfe --- /dev/null +++ b/kernel/zarch/samax.c @@ -0,0 +1,210 @@
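+/* samax = max(|x[i]|) over a single-precision vector. A scalar sketch
+   of the reduction that samax_kernel_64 vectorizes (illustrative only,
+   not part of the build):
+
+       FLOAT amax = fabsf(x[0]);
+       for (BLASLONG i = 1; i < n; i++)
+           if (fabsf(x[i]) > amax) amax = fabsf(x[i]);
+
+   The kernel loads 64 floats per iteration in two batches of eight
+   vector registers, takes absolute values with vflpsb, and folds each
+   batch through a tree of compare-high (vfchsb) and select (vsel)
+   steps before updating the running maximum held in v0. */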
+/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + 
"vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + maxf = samax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c new file mode 100644 index 0000000000..3b8f03e6a2 --- /dev/null +++ b/kernel/zarch/samin.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG 
n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = samin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c new file mode 100644 index 0000000000..2c59ab2e5f --- /dev/null +++ b/kernel/zarch/sasum.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
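+
+A note on the kernel below: it keeps four independent accumulators
+(v0..v3) so that consecutive vfasb additions do not serialize into one
+dependency chain, and folds them together only after the loop. The
+scalar shape of the same idea, for the vectorized head n1 = n & -64
+(sketch only):
+
+    FLOAT s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
+    for (BLASLONG i = 0; i < n1; i += 4) {
+        s0 += fabsf(x[i]);
+        s1 += fabsf(x[i + 1]);
+        s2 += fabsf(x[i + 2]);
+        s3 += fabsf(x[i + 3]);
+    }
+    sumf = s0 + s1 + s2 + s3;   // mirrors the final vfasb folds
+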
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vfasb %%v0,%%v0,%%v3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "ler %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return sumf; + + if (inc_x == 1) { + + n1 = n & -64; + + if (n1 > 0) { + + sumf = sasum_kernel_64(n1, x); + i = n1; + } + + while (i < n) { + sumf += ABS(x[i]); + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += ABS(x[i]); + i += inc_x; + j++; + } + + + } + return sumf; +} + + diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c new file mode 100644 index 0000000000..26ead310cb --- /dev/null +++ b/kernel/zarch/saxpy.c @@ -0,0 +1,184 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( + "vlrepf %%v0,%3 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,80(%%r1,%1) \n\t" + "vl %%v26,96(%%r1,%1) \n\t" + "vl %%v27,112(%%r1,%1) \n\t" + "vl %%v28,64(%%r1,%2) \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vl %%v30,96(%%r1,%2) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "vl %%v16,128(%%r1,%1) \n\t" + "vl %%v17,144(%%r1,%1) \n\t" + "vl %%v18,160(%%r1,%1) \n\t" + "vl %%v19,176(%%r1,%1) \n\t" + "vl %%v20,128(%%r1,%2) \n\t" + "vl %%v21,144(%%r1,%2) \n\t" + "vl %%v22,160(%%r1,%2) \n\t" + "vl %%v23,176(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,192(%%r1,%1) \n\t" + "vl %%v25,208(%%r1,%1) \n\t" + "vl %%v26,224(%%r1,%1) \n\t" + "vl %%v27,240(%%r1,%1) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,128(%%r1,%2) \n\t" + "vst 
%%v17,144(%%r1,%2) \n\t" + "vst %%v18,160(%%r1,%2) \n\t" + "vst %%v19,176(%%r1,%2) \n\t" + "vst %%v20,192(%%r1,%2) \n\t" + "vst %%v21,208(%%r1,%2) \n\t" + "vst %%v22,224(%%r1,%2) \n\t" + "vst %%v23,240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return 0 ; + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + + if ( n1 ) + saxpy_kernel_64(n1, x, y , &da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return 0 ; + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return 0 ; + +} + + diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c new file mode 100644 index 0000000000..ff4227595c --- /dev/null +++ b/kernel/zarch/scopy.c @@ -0,0 +1,85 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
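+
+A note on the kernel below: each mvc instruction moves 256 bytes (64
+single-precision elements) and both pointers advance one block per
+iteration, so for the unit-stride head the kernel behaves like the
+sketch
+
+    memcpy(y, x, n1 * sizeof(FLOAT));   // n1 = n & -64
+
+while keeping the 1024-byte prefetch distance explicit.
+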
+*****************************************************************************/ + +#include "common.h" + +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,6 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","r2" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + if (n <= 0) return 0; + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + + while (i < n) { + y[i] = x[i]; + i++; + + } + + + } else { + + while (i < n) { + + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } + + } + return 0; + + +} diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c new file mode 100644 index 0000000000..fd8c8e4455 --- /dev/null +++ b/kernel/zarch/sdot.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
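+
+A note on the kernel below: partial products accumulate into v0 with
+chained vfmasb (fused multiply-add) instructions, and the four lanes
+are summed at the end with vrepf/aebr. The scalar shape, for the
+vectorized head n1 = n & -32 (sketch only):
+
+    FLOAT dot = 0.0f;
+    for (BLASLONG i = 0; i < n1; i++)
+        dot = fmaf(x[i], y[i], dot);   // one rounding per step, like vfmasb
+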
+*****************************************************************************/ + +#include "common.h" + +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + FLOAT dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepf %%v1,%%v0,1 \n\t" + "vrepf %%v2,%%v0,2 \n\t" + "vrepf %%v3,%%v0,3 \n\t" + "aebr %%f0,%%f1 \n\t" + "aebr %%f0,%%f2 \n\t" + "aebr %%f0,%%f3 \n\t" + "ler %0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = sdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c new file mode 100644 index 0000000000..92019d7322 --- /dev/null +++ b/kernel/zarch/sgemv_n_4.c @@ -0,0 +1,668 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%5) \n\t" + "vlrepf %%v1,4(%5) \n\t" + "vlrepf %%v2,8(%5) \n\t" + "vlrepf %%v3,12(%5) \n\t" + "vlrepf %%v4,%7 \n\t" + "vfmsb %%v0,%%v0,%%v4 \n\t" + "vfmsb %%v1,%%v1,%%v4 \n\t" + "vfmsb %%v2,%%v2,%%v4 \n\t" + "vfmsb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "vl %%v4,32(%%r1,%6) \n\t" + "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" + + "vl %%v4,48(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + + "vl %%v4,64(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" + + "vl %%v4,80(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" + + "vl %%v4,96(%%r1,%6) \n\t" 
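+ /* second 128-byte half, third 16-byte slice: v24..v27 hold the four
+ column slices loaded from offset 96 and v0..v3 the alpha-scaled x
+ values, so the chained vfmasb ops below fold all four column
+ contributions into the y slice just loaded into v4 */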
+ "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" + + "vl %%v4,112(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%3) \n\t" + "vlrepf %%v1,4(%3) \n\t" + "vlrepf %%v2,%5 \n\t" + "vfmsb %%v0,%%v0,%%v2 \n\t" + "vfmsb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmasb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmasb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmasb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmasb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: 
\n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%2) \n\t" + "vlrepf %%v1,%4 \n\t" + "vfmsb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmasb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmasb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i]; + dest += inc_dest; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8],*ybuffer; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + ybuffer = buffer; + + n1 = n >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; 
+ ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + +
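For reference, the four sgemv_n kernels above all compute the same update at different widths: alpha is folded into the broadcast x values up front, so ybuffer accumulates alpha*A*x directly and add_y only has to scatter it. A minimal scalar model of the 4x4 case (a sketch; the helper name is illustrative, not part of the patch):

	static void sgemv_n_ref_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT alpha)
	{
		BLASLONG i;
		for (i = 0; i < n; i++)
			/* y[i] += alpha * (four matrix columns dotted with x[0..3]) */
			y[i] += alpha * (ap[0][i] * x[0] + ap[1][i] * x[1]
			               + ap[2][i] * x[2] + ap[3][i] * x[3]);
	}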
diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c new file mode 100644 index 0000000000..efc06297f3 --- /dev/null +++ b/kernel/zarch/sgemv_t_4.c @@ -0,0 +1,826 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "agfi %%r1,16 \n\t" + "brctg 
%%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v4,%%v0,1 \n\t" + "aebr %%f0,%%f4 \n\t" + "vrepf %%v4,%%v0,2 \n\t" + "aebr %%f0,%%f4 \n\t" + "vrepf %%v4,%%v0,3 \n\t" + "aebr %%f0,%%f4 \n\t" + "ste %%f0,0(%6) \n\t" + "vrepf %%v4,%%v1,1 \n\t" + "aebr %%f1,%%f4 \n\t" + "vrepf %%v4,%%v1,2 \n\t" + "aebr %%f1,%%f4 \n\t" + "vrepf %%v4,%%v1,3 \n\t" + "aebr %%f1,%%f4 \n\t" + "ste %%f1,4(%6) \n\t" + "vrepf %%v4,%%v2,1 \n\t" + "aebr %%f2,%%f4 \n\t" + "vrepf %%v4,%%v2,2 \n\t" + "aebr %%f2,%%f4 \n\t" + "vrepf %%v4,%%v2,3 \n\t" + "aebr %%f2,%%f4 \n\t" + "ste %%f2,8(%6) \n\t" + "vrepf %%v4,%%v3,1 \n\t" + "aebr %%f3,%%f4 \n\t" + "vrepf %%v4,%%v3,2 \n\t" + "aebr %%f3,%%f4 \n\t" + "vrepf %%v4,%%v3,3 \n\t" + "aebr %%f3,%%f4 \n\t" + "ste %%f3,12(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v2,%%v0,1 \n\t" + "aebr %%f0,%%f2 \n\t" + "vrepf %%v2,%%v0,2 \n\t" + "aebr %%f0,%%f2 \n\t" + "vrepf %%v2,%%v0,3 \n\t" + "aebr %%f0,%%f2 \n\t" + "ste %%f0,0(%4) \n\t" + "vrepf %%v2,%%v1,1 \n\t" + "aebr %%f1,%%f2 \n\t" + "vrepf %%v2,%%v1,2 \n\t" + "aebr %%f1,%%f2 \n\t" + "vrepf %%v2,%%v1,3 \n\t" + "aebr %%f1,%%f2 \n\t" + "ste %%f1,4(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +
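The transposed kernels in this file work the other way around: each column is reduced against x, and the raw dot products are stored; alpha is only applied when the caller merges the results into y. A scalar model of the 4x4 reduction (a sketch; the helper name is illustrative, not part of the patch):

	static void sgemv_t_ref_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
	{
		BLASLONG i, k;
		for (k = 0; k < 4; k++) {
			FLOAT s = 0.0;
			for (i = 0; i < n; i++)
				s += ap[k][i] * x[i];   /* dot(column k, x) over the block */
			y[k] = s;                   /* stored unscaled; caller applies alpha */
		}
	}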
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v1,%%v0,1 \n\t" + "aebr %%f0,%%f1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "vrepf %%v1,%%v0,3 \n\t" + "aebr %%f0,%%f1 \n\t" + "ste %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + dest[i] = *src; + src += inc_src; + } +} +
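A note on the loop bounds shared by all of these kernels: the prologue "lghi %%r0,-32; ngr %%r0,%0" masks n down to a multiple of 32 (the unrolled loop consumes 32 floats, 128 bytes, per iteration), and the epilogue "lghi %%r0,28; ngr %%r0,%0" picks out the remaining whole vectors of 4 floats; the last n & 3 elements are left to the C driver. The same split in C:

	BLASLONG unrolled = n & -32;        /* iterations of the 32-float main loop */
	BLASLONG vec_tail = (n & 28) >> 2;  /* leftover 4-float vector iterations */
	/* the n & 3 scalar leftovers are handled outside the asm kernel */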
"vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" + "vst %%v30, 96(%%r1,%3) \n\t" + "vl %%v31, 112(%%r1,%3) \n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" + "vst %%v31, 112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else + { + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i] * da; + dest += inc_dest; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j 0) { + + maxf = smax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if 
(x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c new file mode 100644 index 0000000000..e882b7ff17 --- /dev/null +++ b/kernel/zarch/smin.c @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = smin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c new file mode 100644 index 0000000000..763cc664ac --- /dev/null +++ b/kernel/zarch/srot.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 
2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s 
*/ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG 
i=0; + BLASLONG ix=0,iy=0; + + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c new file mode 100644 index 0000000000..c18a7e56f3 --- /dev/null +++ b/kernel/zarch/sscal.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} + +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0,j=0; + if ( n <= 0 || inc_x <=0 ) + return(0); + + + if ( inc_x == 1 ) + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + + sscal_kernel_32_zero(n1, x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + sscal_kernel_32(n1, da, x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i]=0.0; + x[i + inc_x]=0.0; + + i += inc_x * 2; + j += 2; + + } + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i] = da * x[i] ; + x[i + inc_x] = da * x[i + inc_x]; + + i += inc_x * 2; + j += 2; + + } + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c new file mode 100644 index 0000000000..d0c0dc3f42 --- /dev/null +++ b/kernel/zarch/sswap.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + 
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + sswap_kernel_64(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c new file mode 100644 index 0000000000..6393b099b0 --- /dev/null +++ b/kernel/zarch/zamax.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (maxf);
 + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + maxf = zamax_kernel_16(n1, x); + + i = n1; + } + else + { + maxf=CABS1(x,0); + i++; + } + + while (i < n) { + if (CABS1(x,i*2) > maxf) { + maxf = CABS1(x,i*2); + } + i++; + } + return (maxf); + + } else { + + inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) > maxf) { + maxf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) > maxf) { + maxf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) > maxf) { + maxf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (maxf); + } +}
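Like the other complex amax/amin kernels in this patch, zamax ranks elements by CABS1 = |re| + |im| rather than by the true complex modulus, which is the usual BLAS convention for these reductions. A scalar model of the whole reduction (a sketch, for the contiguous case):

	FLOAT m = CABS1(x, 0);
	BLASLONG i;
	for (i = 1; i < n; i++)
		if (CABS1(x, i * 2) > m)
			m = CABS1(x, i * 2);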
 diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c new file mode 100644 index 0000000000..b15774bb9f --- /dev/null +++ b/kernel/zarch/zamin.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return (minf);
+
+    if (inc_x == 1) {
+
+        BLASLONG n1 = n & -16;
+        if (n1 > 0) {
+
+            minf = zamin_kernel_16(n1, x);
+
+            i = n1;
+        }
+        else
+        {
+            minf = CABS1(x,0);
+            i++;
+        }
+
+        while (i < n) {
+            if (CABS1(x,i*2) < minf) {
+                minf = CABS1(x,i*2);
+            }
+            i++;
+        }
+        return (minf);
+
+    } else {
+
+        inc_x2 = 2 * inc_x;
+        minf = CABS1(x,0);
+        i += inc_x2;
+        j++;
+
+        BLASLONG n1 = (n - 1) & -4;
+        while (j < n1) {
+
+            if (CABS1(x,i) < minf) {
+                minf = CABS1(x,i);
+            }
+            if (CABS1(x,i+inc_x2) < minf) {
+                minf = CABS1(x,i+inc_x2);
+            }
+            if (CABS1(x,i+inc_x2*2) < minf) {
+                minf = CABS1(x,i+inc_x2*2);
+            }
+            if (CABS1(x,i+inc_x2*3) < minf) {
+                minf = CABS1(x,i+inc_x2*3);
+            }
+
+            i += inc_x2 * 4;
+
+            j += 4;
+
+        }
+
+        while (j < n) {
+            if (CABS1(x,i) < minf) {
+                minf = CABS1(x,i);
+            }
+            i += inc_x2;
+            j++;
+        }
+        return (minf);
+    }
+}
diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c
index 0fc5c9ecbe..8faaf20ebc 100644
--- a/kernel/zarch/zasum.c
+++ b/kernel/zarch/zasum.c
@@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
 #include "common.h"
 #include <math.h>
 
 #if defined(DOUBLE)
-
 #define ABS fabs
-
 #else
-
 #define ABS fabsf
-
 #endif
-
-static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
-
+static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
+{
     FLOAT asum;
 
-    __asm__ (
-        "pfd 1, 0(%[ptr_x]) \n\t"
-        "sllg %%r0,%[n],4 \n\t"
-        "agr %%r0,%[ptr_x] \n\t"
-        "vzero %%v0 \n\t"
-        "vzero %%v1 \n\t"
-        "vzero %%v22 \n\t"
-        "vzero %%v23 \n\t"
-        ".align 16 \n\t"
-        "1: \n\t"
-        "pfd 1, 256(%[ptr_tmp] ) \n\t"
-        "vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t"
-
-        "vflpdb %%v24, %%v24 \n\t"
-        "vflpdb %%v25, %%v25 \n\t"
-        "vflpdb %%v26, %%v26 \n\t"
-        "vflpdb %%v27, %%v27 \n\t"
-        "vflpdb %%v28, %%v28 \n\t"
-        "vflpdb %%v29, %%v29 \n\t"
-        "vflpdb %%v30, %%v30 \n\t"
-        "vflpdb %%v31, %%v31 \n\t"
-
-        "vfadb %%v0,%%v0,%%v24 \n\t"
-        "vfadb %%v1,%%v1,%%v25 \n\t"
-        "vfadb %%v23,%%v23,%%v26 \n\t"
-        "vfadb %%v22,%%v22,%%v27 \n\t"
-        "vfadb %%v0,%%v0,%%v28 \n\t"
-        "vfadb %%v1,%%v1,%%v29 \n\t"
-        "vfadb %%v23,%%v23,%%v30 \n\t"
-        "vfadb %%v22,%%v22,%%v31 \n\t"
-
-        "vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t"
-        "vflpdb %%v24, %%v24 \n\t"
-        "vflpdb %%v25, %%v25 \n\t"
-        "vflpdb %%v26, %%v26 \n\t"
-        "vflpdb %%v27, %%v27 \n\t"
-        "vflpdb %%v28, %%v28 \n\t"
-        "vflpdb %%v29, %%v29 \n\t"
-        "vflpdb %%v30, %%v30 \n\t"
-        "vflpdb %%v31, %%v31 \n\t"
-        "la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
-        "vfadb %%v0,%%v0,%%v24 \n\t"
-        "vfadb %%v1,%%v1,%%v25 \n\t"
-        "vfadb %%v23,%%v23,%%v26 \n\t"
-        "vfadb %%v22,%%v22,%%v27 \n\t"
-        "vfadb %%v0,%%v0,%%v28 \n\t"
-        "vfadb %%v1,%%v1,%%v29 \n\t"
-        "vfadb %%v23,%%v23,%%v30 \n\t"
-        "vfadb %%v22,%%v22,%%v31 \n\t"
-
-        "clgrjl %[ptr_tmp],%%r0,1b \n\t"
-        "vfadb %%v24,%%v0,%%v1 \n\t"
-        "vfadb %%v25,%%v23,%%v22 \n\t"
-        "vfadb %%v0,%%v25,%%v24 \n\t"
-        "vrepg %%v1,%%v0,1 \n\t"
-        "adbr %%f0,%%f1 \n\t"
-        "ldr %[asum] ,%%f0"
-        : [asum] "=f"(asum),[ptr_tmp] "+&a"(x)
-        : [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x)
-        : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
-    );
-    return asum;
+    __asm__ (
+        "vzero %%v0 \n\t"
+        "vzero %%v1 \n\t"
+        "vzero %%v2 \n\t"
+        "vzero %%v3 \n\t"
+        "srlg %%r0,%1,4 \n\t"
+        "xgr %%r1,%%r1 \n\t"
+        "0: \n\t"
+        "pfd 1, 1024(%%r1,%2) \n\t"
+        "vl %%v16, 0(%%r1,%2) \n\t"
+        "vl %%v17, 16(%%r1,%2) \n\t"
+        "vl %%v18, 32(%%r1,%2) \n\t"
+        "vl %%v19, 48(%%r1,%2) \n\t"
+        "vl %%v20, 64(%%r1,%2) \n\t"
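/* second 64 bytes of this 128-byte block; the vflpdb group below takes lane-wise |x| before accumulating into v0..v3 */
+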
"vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + return asum; } - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; @@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( n1 > 0 ) { - sumf=zasum_kernel_16(n1, x ); + sumf = zasum_kernel_16(n1, x); i=n1; ip=2*n1; } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 212de25c81..6ba44a27c9 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ - +*****************************************************************************/ #include "common.h" - -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { - - BLASLONG tempR1 ; - __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" - "pfd 2, 0(%[y_tmp]) \n\t" +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( #if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} - -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vl %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 32(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 48(%[t1],%[y_tmp]) \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vl %%v30, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v31, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 96(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 1, 256(%[t1],%[x_tmp]) \n\t" - "pfd 2, 256(%[t1],%[y_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - - "vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" - "vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" - "vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" - "vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" - "vl %%v16, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v17, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v18, 96(%[t1],%[y_tmp]) \n\t" - "vl %%v19, 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v6 , 
32(%[t1],%[y_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - + "vlrepg %%v0,0(%3) \n\t" + "vleg %%v1,8(%3),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%3),1 \n\t" +#else + "vleg %%v0,0(%3),1 \n\t" + "vflcdb %%v0,%%v0 \n\t" + "vleg %%v0,0(%3),0 \n\t" + "vlrepg %%v1,8(%3) \n\t" +#endif + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT 
*y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; + FLOAT da[2]; if (n <= 0) return (0); @@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; - if (n1) { - zaxpy_kernel_8(n1, x, y, da_r,da_i); + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); ix = 2 * n1; } i = n1; diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index b5bf383f70..8c940bba3c 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - -#include "common.h" - -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n) - : [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; +#include "common.h" +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,4 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","r2" + ); } - int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG 
inc_y) { BLASLONG i=0; @@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } } - return(0); - + return(0); } - - diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 61c5d6b98a..aab18e2e9b 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" -#if defined(Z13) - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ __asm__ volatile( - "pfd 1, 0(%[ptr_x_tmp]) \n\t" - "pfd 1, 0(%[ptr_y_tmp]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %[n_tmp],%[n_tmp],3 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" - "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - - "vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - "la %%r1,128(%%r1) \n\t" - "brctg %[n_tmp],1b \n\t" - "vfadb %%v24,%%v26,%%v24 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vsteg %%v24, 0(%[ptr_d]),0 \n\t" - "vsteg %%v24, 8(%[ptr_d]),1 \n\t" - "vsteg %%v25,16(%[ptr_d]),1 \n\t" - "vsteg %%v25,24(%[ptr_d]),0 \n\t" - : [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n) - : [mem_x] "m"( *(const double (*)[2*n])x), - [mem_y] "m"( *(const double (*)[2*n])y), - [ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d) - : "cc", "r1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - -} - -#else - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { - 
BLASLONG register i = 0; - FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0}; - BLASLONG j = 0; - - while (i < n) { - - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; - - dot[0] += x[j + 2] * y[j + 2]; - dot[1] += x[j + 3] * y[j + 3]; - dot[2] += x[j + 2] * y[j + 3]; - dot[3] += x[j + 3] * y[j + 2]; - - dot[0] += x[j + 4] * y[j + 4]; - dot[1] += x[j + 5] * y[j + 5]; - dot[2] += x[j + 4] * y[j + 5]; - dot[3] += x[j + 5] * y[j + 4]; - - dot[0] += x[j + 6] * y[j + 6]; - dot[1] += x[j + 7] * y[j + 7]; - dot[2] += x[j + 6] * y[j + 7]; - dot[3] += x[j + 7] * y[j + 6]; - - j += 8; - i += 4; - - } - d[0] = dot[0]; - d[1] = dot[1]; - d[2] = dot[2]; - d[3] = dot[3]; - + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" + + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v24,%%v24,%%v26 \n\t" + "vfadb %%v24,%%v24,%%v28 \n\t" + "vfadb %%v24,%%v24,%%v30 \n\t" + "vfadb %%v25,%%v25,%%v27 \n\t" + "vfadb %%v25,%%v25,%%v29 \n\t" + "vfadb %%v25,%%v25,%%v31 \n\t" + "vsteg %%v24,0(%3),0 \n\t" + "vsteg %%v24,8(%3),1 \n\t" + "vsteg %%v25,16(%3),1 \n\t" + "vsteg %%v25,24(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix=0, iy=0; + BLASLONG i; + BLASLONG ix, iy; OPENBLAS_COMPLEX_FLOAT result; FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; @@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { BLASLONG n1 = n & -8; - BLASLONG j=0; - if (n1){ + if (n1) zdot_kernel_8(n1, x, y, dot); - i = n1; - j = n1 <<1; - } - + + i = 
n1; + BLASLONG j = i * 2; while (i < n) { diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 380f0140e8..75027a06c0 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27,112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19,112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) 
\n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "clgrjl %%r1,%[tmp],1b \n\t" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc","r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 
16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl 
%%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) - { - zrot_kernel_16(n1, x, y, c, s); + { + FLOAT cosa,sina; + cosa=c; + sina=s; + zrot_kernel_16(n1, x, y, &cosa, &sina); i=n1; ix=2*n1; } @@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4764c0a522..4d8ee960fd 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ +*****************************************************************************/ #include "common.h" - - -static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { - BLASLONG tempR1 ; - __asm__ ( - "pfd 2, 0(%[x_tmp]) \n\t" -#if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} - -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vl %%v20, 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21, 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22, 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23, 112(%[t1],%[x_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - - "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 2, 256(%[t1],%[x_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - - "vfmdb %%v30, %%v20, %%v28 \n\t" - "vfmdb %%v31, %%v21, %%v28 \n\t" - "vfmdb %%v6, %%v22, %%v28 \n\t" - "vfmdb %%v7, %%v23, %%v28 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - - "vst %%v30 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v6 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 
0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - : [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - - +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "vleg %%v1,8(%1),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + "vpdi %%v28,%%v20,%%v20,4 \n\t" + "vpdi %%v29,%%v21,%%v21,4 \n\t" + "vpdi %%v30,%%v22,%%v22,4 \n\t" + "vpdi %%v31,%%v23,%%v23,4 \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%1) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint - "vflcdb %%v16,%%v16 \n\t" //complement both - "vlvgg %%v16,%%r0,0 \n\t" //restore 1st - "vlr %%v17 ,%%v16 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v16 \n\t" - "vsteg %%v24, 0(%[x_ptr]),1 \n\t" - "vsteg %%v24, 8(%[x_ptr]),0 \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v17 \n\t" - "vsteg %%v25, 16(%[x_ptr]),1 \n\t" - "vsteg %%v25, 24(%[x_ptr]),0 \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vsteg %%v26, 32(%[x_ptr]),1 \n\t" - "vsteg %%v26, 40(%[x_ptr]),0 \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vsteg %%v27, 48(%[x_ptr]),1 \n\t" - "vsteg %%v27, 56(%[x_ptr]),0 \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v16 \n\t" - "vsteg %%v28, 64(%[x_ptr]),1 \n\t" - "vsteg %%v28, 72(%[x_ptr]),0 \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v17 \n\t" - "vsteg %%v29, 80(%[x_ptr]),1 \n\t" - "vsteg %%v29, 88(%[x_ptr]),0 \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb 
%%v30,%%v30,%%v16 \n\t" - "vsteg %%v30, 96(%[x_ptr]),1 \n\t" - "vsteg %%v30, 104(%[x_ptr]),0 \n\t" - "vl %%v31, 112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vsteg %%v31, 112(%[x_ptr]),1 \n\t" - "vsteg %%v31, 120(%[x_ptr]),0 \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_i) - :"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - - +static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vleg %%v0,8(%1),0 \n\t" + "wflcdb %%v0,%%v0 \n\t" + "vleg %%v0,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v16,%%v16,%%v16,4 \n\t" + "vpdi %%v17,%%v17,%%v17,4 \n\t" + "vpdi %%v18,%%v18,%%v18,4 \n\t" + "vpdi %%v19,%%v19,%%v19,4 \n\t" + "vpdi %%v20,%%v20,%%v20,4 \n\t" + "vpdi %%v21,%%v21,%%v21,4 \n\t" + "vpdi %%v22,%%v22,%%v22,4 \n\t" + "vpdi %%v23,%%v23,%%v23,4 \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v18,%%r0,%%r0 \n\t" - "vlr %%v19,%%v18 \n\t" - "vlr %%v16,%%v18 \n\t" - "vlr %%v17,%%v18 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v18 \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v19 \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vst %%v26, 32(%[x_ptr]) \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vst %%v27, 48(%[x_ptr]) \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v18 \n\t" - "vst %%v28, 64(%[x_ptr]) \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v19 \n\t" - "vst %%v29, 80(%[x_ptr]) \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vst %%v30, 96(%[x_ptr]) \n\t" - "vl %%v31,112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vst %%v31,112(%[x_ptr]) \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_r) - : "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" - ); - +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + 
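/* da_i == 0 in this kernel, so every lane is simply scaled by da_r (splat into v0 by the vlrepg above) */
+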
"vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256( %[x_ptr]) \n\t" - "vst %%v24, 0( %[x_ptr]) \n\t" - "vst %%v25, 16( %[x_ptr]) \n\t" - "vst %%v26, 32( %[x_ptr]) \n\t" - "vst %%v27, 48( %[x_ptr]) \n\t" - "vst %%v24, 64( %[x_ptr]) \n\t" - "vst %%v25, 80( %[x_ptr]) \n\t" - "vst %%v26, 96( %[x_ptr]) \n\t" - "vst %%v27,112( %[x_ptr]) \n\t" - - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" ,"r0","v24","v25","v26","v27" - ); - +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } - - - - -static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) { - +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; - for (i = 0; i < n; i += 4) { + for (i = 0; i < n; i += 4) + { t0 = da_r * x[0] - da_i * x[1]; t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; @@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS x[inc_x3] = t3; x += 4 * inc_x; - } - - } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; FLOAT temp0; FLOAT temp1; - + FLOAT alpha[2] __attribute__ ((aligned(16))); if (inc_x != 1) { inc_x <<= 1; @@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { BLASLONG n1 = n & -8; - if (n1 > 0) { - zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); + if (n1 > 0) { + 
alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); j = n1; i = n1 * inc_x; } @@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; if (da_r == 0.0) if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, da_i, x); + zscal_kernel_8_zero_r(n1, alpha, x); else if (da_i == 0) - zscal_kernel_8_zero_i(n1, da_r, x); + zscal_kernel_8_zero_i(n1, alpha, x); else - zscal_kernel_8(n1, da_r,da_i, x); + zscal_kernel_8(n1, alpha, x); i = n1 << 1; j = n1; @@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return (0); } - - diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0620790020..a16b87cdc7 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" - -#if defined(Z13_SWAP_A) -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 
208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ volatile( + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else - -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) 
\n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#endif - - - - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; diff --git a/ztest/Makefile b/ztest/Makefile new file mode 100644 index 0000000000..0ff7fe46a5 --- /dev/null +++ b/ztest/Makefile @@ -0,0 +1,437 @@ +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + +goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto + +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dsdot #################################################### +dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMAX ############################################## +ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX 
############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX ############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cgemv #################################################### + 
+cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Crot #################################################### +crot.goto : crot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zrot #################################################### +zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +################################################################################################### + +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +dsdot.$(SUFFIX) : dsdot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c 
-DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +samax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ismax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +smax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +isamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +samin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ismin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +smin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +crot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + 
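
Each driver source above is compiled up to four times, and the -UCOMPLEX/-DCOMPLEX and -UDOUBLE/-DDOUBLE pairs select which BLAS flavour a given object file exercises. A minimal sketch of what that flag matrix controls — assuming the usual driver preamble (stdio.h, stdlib.h, and sys/time.h under Cygwin), with element_t and components as illustrative stand-ins for the FLOAT and COMPSIZE definitions that common.h actually provides:

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>      /* struct timeval / gettimeofday on Cygwin */
#endif
#include "common.h"        /* supplies FLOAT, COMPSIZE, BLASFUNC(), blasint, ... */

#ifdef DOUBLE
typedef double element_t;  /* FLOAT is double: the d and z entry points */
#else
typedef float element_t;   /* FLOAT is float: the s and c entry points */
#endif

#ifdef COMPLEX
enum { components = 2 };   /* COMPSIZE is 2: interleaved re/im pairs */
#else
enum { components = 1 };   /* COMPSIZE is 1: real data */
#endif

This is why a single dot.c can serve sdot, ddot, cdot and zdot: the driver sizes its buffers with COMPSIZE and resolves the routine name through BLASFUNC at compile time.
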
+dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +clean :: + @rm -f *.goto + diff --git a/ztest/amax.c b/ztest/amax.c new file mode 100644 index 0000000000..f2e3f54119 --- /dev/null +++ b/ztest/amax.c @@ -0,0 +1,235 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef AMAX +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* 
avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef AMIN +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + 
BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} +#else +FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} +#endif + +#undef ASUM +#ifdef COMPLEX +#ifdef DOUBLE +#define ASUM BLASFUNC(dzasum) +#else +#define ASUM BLASFUNC(scasum) +#endif +#else +#ifdef DOUBLE +#define ASUM BLASFUNC(dasum) +#else +#define ASUM BLASFUNC(sasum) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, 
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} +#else +int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef AXPY +#ifdef COMPLEX +#ifdef DOUBLE +#define AXPY BLASFUNC(zaxpy) +#else +#define AXPY BLASFUNC(caxpy) +#endif +#else +#ifdef DOUBLE +#define AXPY BLASFUNC(daxpy) +#else +#define AXPY BLASFUNC(saxpy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c;; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + argc--;argv++; + + blasint iy; + int test = 1; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of 
Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} +#else +int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef COPY +#ifdef COMPLEX +#ifdef DOUBLE +#define COPY BLASFUNC(zcopy) +#else +#define COPY BLASFUNC(ccopy) +#endif +#else +#ifdef DOUBLE +#define COPY BLASFUNC(dcopy) +#else +#define COPY BLASFUNC(scopy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) 
inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} +#else +FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} +#endif + +#undef DOT +#ifdef COMPLEX +#ifdef DOUBLE +#define DOT BLASFUNC(zdotu) +#else +#define DOT BLASFUNC(cdotu) +#endif +#else +#ifdef DOUBLE +#define DOT BLASFUNC(ddot) +#else +#define DOT BLASFUNC(sdot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, 
SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; +#ifdef COMPLEX + OPENBLAS_COMPLEX_FLOAT result, result_c; +#else + FLOAT result, result_c; +#endif + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} + +#undef DSDOT +#define DSDOT BLASFUNC(dsdot) + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + 
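
dsdot_c above is the reference for dsdot: single-precision inputs, but the products are accumulated (and returned) in double precision. A small standalone example of why the wide accumulator matters — the values are chosen, purely for illustration, so that a float accumulator cancels everything while a double accumulator keeps the exact answer:

#include <stdio.h>

int main(void)
{
    /* x . y with x = (1e8, 1, -1e8) and y = (1, 1, 1); the exact result is 1 */
    float x[3] = { 1e8f, 1.0f, -1e8f };
    float y[3] = { 1.0f, 1.0f, 1.0f };

    float s = 0.0f;   /* sdot-style accumulation  */
    double d = 0.0;   /* dsdot-style accumulation */

    for (int i = 0; i < 3; i++) {
        s += x[i] * y[i];                 /* 1e8f + 1.0f rounds back to 1e8f */
        d += (double)x[i] * (double)y[i];
    }

    printf("float accumulator : %g\n", (double)s);  /* prints 0 */
    printf("double accumulator: %g\n", d);          /* prints 1 */
    return 0;
}
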
+int main(int argc, char *argv[]){ + + FLOAT *x, *y; + double result, result_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; jtv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y, *y_c; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char trans='N'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + blasint n=0; + int has_param_n = 0; + int has_param_m = 0; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + + int tomax = to; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = 
getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_PARAM_N"))) { + n = atoi(p); + if ((n>0)) has_param_n = 1; + if ( n > tomax ) tomax = n; + } + if ( has_param_n == 0 ) + if ((p = getenv("OPENBLAS_PARAM_M"))) { + m = atoi(p); + if ((m>0)) has_param_m = 1; + if ( m > tomax ) tomax = m; + } + + + + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + if (has_param_m == 0) + { + + for(m = from; m <= to; m += step) + { + timeg=0; + timeg_c=0; + if ( has_param_n == 0 ) n = m; + fprintf(stderr, " %6dx%d :", (int)m,(int)n); + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} +#else +BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} +#endif + +#undef IAMAX +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMAX BLASFUNC(izamax) +#else +#define IAMAX BLASFUNC(icamax) +#endif +#else +#ifdef DOUBLE +#define IAMAX BLASFUNC(idamax) +#else +#define IAMAX BLASFUNC(isamax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres 
-= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} +#else +BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} +#endif + +#undef IAMIN +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMIN BLASFUNC(izamin) +#else +#define IAMIN BLASFUNC(icamin) +#endif +#else +#ifdef DOUBLE +#define IAMIN BLASFUNC(idamin) +#else +#define IAMIN BLASFUNC(isamin) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + 
FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + +#undef IMAX +#ifdef DOUBLE +#define IMAX BLASFUNC(idmax) +#else +#define IMAX BLASFUNC(ismax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = 
(long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + +#undef IMIN +#ifdef DOUBLE +#define IMIN BLASFUNC(idmin) +#else +#define IMIN BLASFUNC(ismin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) 
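/* Note on the i*max/i*min reference functions above: like the Fortran BLAS routines they shadow, they return 1-based indices (hence return(max+1) / return(min+1)) and 0 for invalid arguments (n <= 0 or inc_x <= 0). The drivers compare this 1-based result directly against what the optimized kernels return. */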
< 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, "   SIZE            Flops                            Time                     CTime                     Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include <stdio.h> +#include <stdlib.h> +#ifdef __CYGWIN32__ +#include <sys/time.h> +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef MAX_ +#ifdef DOUBLE +#define MAX_ BLASFUNC(dmax) +#else +#define MAX_ BLASFUNC(smax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int 
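/* smax/dmax (and the smin/dmin drivers below) exercise OpenBLAS extensions: reference BLAS defines only the absolute-value search i?amax, while these routines compare signed values with no ABS, which is why max_c above seeds maxf from x[0] rather than from |x[0]|. */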
loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef MIN_ +#ifdef DOUBLE +#define MIN_ BLASFUNC(dmin) +#else +#define MIN_ BLASFUNC(smin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = 
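/* All the ztest drivers share the same environment knobs: OPENBLAS_LOOPS (repetitions per problem size), OPENBLAS_INCX and OPENBLAS_INCY (vector strides), and, for the gemv driver, OPENBLAS_TRANS plus OPENBLAS_PARAM_M / OPENBLAS_PARAM_N to pin one matrix dimension. */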
getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef ROT +#ifdef COMPLEX +#ifdef DOUBLE +#define ROT BLASFUNC(zdrot) +#else +#define ROT BLASFUNC(csrot) +#endif +#else +#ifdef DOUBLE +#define ROT BLASFUNC(drot) +#else +#define ROT BLASFUNC(srot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + // FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + FLOAT c[1] = { 2.0 }; + FLOAT s[1] = { 2.0 }; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + 
int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; + + if ( (n <= 0) || (inc_x <= 0)) + return(0); + + inc_x2 = 2 * inc_x; + for ( i=0; itv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *x_c; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, 
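/* zscal_c above is the scalar reference for x := (da_r + i*da_i) * x; with the temporaries it declares, the per-element update is the textbook complex scale: temp = da_r*x[ip] - da_i*x[ip+1]; x[ip+1] = da_r*x[ip+1] + da_i*x[ip]; x[ip] = temp; with ip advancing by inc_x2 = 2*inc_x. */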
step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef SWAP +#ifdef COMPLEX +#ifdef DOUBLE +#define SWAP BLASFUNC(zswap) +#else +#define SWAP BLASFUNC(cswap) +#endif +#else +#ifdef DOUBLE +#define SWAP BLASFUNC(dswap) +#else +#define SWAP BLASFUNC(sswap) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 
200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l Date: Mon, 6 Aug 2018 20:03:49 +0300 Subject: [PATCH 002/189] [ZARCH] Restore detect() function --- cpuid_zarch.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 0ae32f27d7..073419fa84 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -45,9 +45,29 @@ static char *cpuname_lower[] = { int detect(void) { - // return CPU_GENERIC; - return CPU_Z14; - + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = fopen("/proc/sysinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("Type", buffer, 4)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + fclose(infile); + + if (strstr(p, "2964")) return CPU_Z13; + if (strstr(p, "2965")) return CPU_Z13; + if (strstr(p, "3906")) return CPU_Z14; + if (strstr(p, "3907")) return CPU_Z14; + + return CPU_GENERIC; } void get_libname(void) From e6c0e39492d49eded5a72c9882b79bed7bff35d0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 13 Aug 2018 12:23:40 +0300 Subject: [PATCH 003/189] Optimize Zgemv --- cpuid_zarch.c | 8 +- kernel/zarch/KERNEL.Z13 | 4 +- kernel/zarch/KERNEL.Z14 | 8 +- kernel/zarch/camax.c | 46 +- kernel/zarch/camin.c | 46 +- kernel/zarch/caxpy.c | 4 +- kernel/zarch/cgemv_n_4.c | 743 ++++++++++++++++++++ kernel/zarch/cgemv_t_4.c | 671 ++++++++++++++++++ kernel/zarch/icamax.c | 9 +- kernel/zarch/icamin.c | 9 +- kernel/zarch/idamax.c | 11 +- kernel/zarch/idamin.c | 11 +- kernel/zarch/idmax.c | 11 +- kernel/zarch/idmin.c | 11 +- kernel/zarch/isamax.c | 11 +- kernel/zarch/isamin.c | 11 +- kernel/zarch/ismax.c | 11 +- kernel/zarch/ismin.c | 11 +- kernel/zarch/izamax.c | 9 +- kernel/zarch/izamin.c | 9 +- kernel/zarch/zamax.c | 48 +- kernel/zarch/zamin.c | 46 +- kernel/zarch/zaxpy.c | 4 +- kernel/zarch/zgemv_n_4.c | 1401 ++++++++++++++++---------------------- kernel/zarch/zgemv_t_4.c | 1267 +++++++++++++++------------------- ztest/gemv.c | 159 +++-- 26 files changed, 2866 insertions(+), 1713 deletions(-) create mode 100644 kernel/zarch/cgemv_n_4.c create mode 100644 kernel/zarch/cgemv_t_4.c diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 
073419fa84..8ed40099b4 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,9 +27,9 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", @@ -112,7 +112,7 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; - case CPU_Z14: + case CPU_Z14: printf("#define Z14\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index d39b9d904b..e5b974ab4e 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index fa88b68810..80f78f48fa 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -73,13 +73,13 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = sgemv_n_4.c DGEMVNKERNEL = dgemv_n_4.c -CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c +CGEMVNKERNEL = cgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVTKERNEL = dgemv_t_4.c -CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +CGEMVTKERNEL = cgemv_t_4.c +ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 6394be7694..3506c4e9b9 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -198,7 +198,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG inc_x2; @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { maxf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) > maxf) { - maxf = ABS(x[i*2]); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } + ix += 2; i++; } return (maxf); } else { - inc_x2 = 2 * inc_x; maxf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) > maxf) { - maxf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) > maxf) { - maxf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) > maxf) { - maxf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (maxf); } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 936c300c88..726747b999 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -198,7 +198,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT minf = 
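/* The camax/camin rewrite above untangles the loop indices: the old code used i as an offset in FLOAT elements and j as the trip counter; the new code keeps i as the element counter and introduces ix as the array offset, so both the unit-stride and the strided paths consistently read CABS1(x,ix) and the unrolled-by-4 loop advances ix by inc_x2 * 4 per pass. */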
0.0; BLASLONG inc_x2; @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { minf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) < minf) { - minf = ABS(x[i*2]); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } + ix += 2; i++; } return (minf); } else { - inc_x2 = 2 * inc_x; minf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) < minf) { - minf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) < minf) { - minf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) < minf) { - minf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (minf); } diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index 2176f3dcd9..fe5568cc83 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -110,7 +110,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "agfi %%r1,128 \n\t" "brctg %%r0,0b " : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); } @@ -118,7 +118,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; - FLOAT da[2]; + FLOAT da[2] __attribute__ ((aligned(16))); if (n <= 0) return (0); diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c new file mode 100644 index 0000000000..4c3253774a --- /dev/null +++ b/kernel/zarch/cgemv_n_4.c @@ -0,0 +1,743 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define NBMAX 1024 + +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%5) \n\t" + "vlrepg %%v17,8(%5) \n\t" + "vlrepg %%v18,16(%5) \n\t" + "vlrepg %%v19,24(%5) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" +#else + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlef %%v24,0(%%r1,%1),0 \n\t" + "vlef %%v24,0(%%r1,%1),1 \n\t" + "vlef %%v24,8(%%r1,%1),2 \n\t" + "vlef %%v24,8(%%r1,%1),3 \n\t" + "vlef %%v25,4(%%r1,%1),0 \n\t" + "vlef %%v25,4(%%r1,%1),1 \n\t" + "vlef %%v25,12(%%r1,%1),2 \n\t" + "vlef %%v25,12(%%r1,%1),3 \n\t" + "vlef %%v26,0(%%r1,%2),0 \n\t" + "vlef %%v26,0(%%r1,%2),1 \n\t" + "vlef %%v26,8(%%r1,%2),2 \n\t" + "vlef %%v26,8(%%r1,%2),3 \n\t" + "vlef %%v27,4(%%r1,%2),0 \n\t" + "vlef %%v27,4(%%r1,%2),1 \n\t" + "vlef %%v27,12(%%r1,%2),2 \n\t" + "vlef %%v27,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlef %%v28,0(%%r1,%1),0 \n\t" + "vlef %%v28,0(%%r1,%1),1 \n\t" + "vlef %%v28,8(%%r1,%1),2 \n\t" + "vlef %%v28,8(%%r1,%1),3 \n\t" + "vlef %%v29,4(%%r1,%1),0 \n\t" + "vlef %%v29,4(%%r1,%1),1 \n\t" + "vlef %%v29,12(%%r1,%1),2 \n\t" + "vlef %%v29,12(%%r1,%1),3 \n\t" + "vlef %%v30,0(%%r1,%2),0 \n\t" + "vlef %%v30,0(%%r1,%2),1 \n\t" + "vlef %%v30,8(%%r1,%2),2 \n\t" + "vlef %%v30,8(%%r1,%2),3 \n\t" + "vlef %%v31,4(%%r1,%2),0 \n\t" + "vlef %%v31,4(%%r1,%2),1 \n\t" + "vlef %%v31,12(%%r1,%2),2 \n\t" + 
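/* Complex multiply via two real FMAs: v16..v19 each hold one x_j replicated as (re,im,re,im) (vlrepg broadcasts the 8-byte complex pair), while v20..v23 hold the partner (-im,re,-im,re); vflcsb negates lanes, and the CONJ/XCONJ variants pick which component is loaded complemented. Inside the loop, the vlef sequences split consecutive a-elements into duplicated real lanes (v24, v26, ...) and duplicated imaginary lanes (v25, v27, ...), so y += a*x reduces to one vfmasb against the (re,im) vector plus one vfmasb against the sign-flipped swap. */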
"vlef %%v31,12(%%r1,%2),3 \n\t" + + "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,0(%%r1,%6) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%3) \n\t" + "vlrepg %%v17,8(%3) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" +#else + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlef %%v20,0(%%r1,%1),0 \n\t" + "vlef %%v20,0(%%r1,%1),1 \n\t" + "vlef %%v20,8(%%r1,%1),2 \n\t" + "vlef %%v20,8(%%r1,%1),3 \n\t" + "vlef %%v21,4(%%r1,%1),0 \n\t" + "vlef %%v21,4(%%r1,%1),1 \n\t" + "vlef %%v21,12(%%r1,%1),2 \n\t" + "vlef %%v21,12(%%r1,%1),3 \n\t" + "vlef %%v22,0(%%r1,%2),0 \n\t" + "vlef %%v22,0(%%r1,%2),1 \n\t" + "vlef %%v22,8(%%r1,%2),2 \n\t" + "vlef %%v22,8(%%r1,%2),3 \n\t" + "vlef %%v23,4(%%r1,%2),0 \n\t" + "vlef %%v23,4(%%r1,%2),1 \n\t" + "vlef %%v23,12(%%r1,%2),2 \n\t" + "vlef %%v23,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%4) \n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,0(%%r1,%4) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%2) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v17,4(%2),0 \n\t" + "vlef %%v17,4(%2),2 \n\t" + "vflcsb %%v17,%%v17 \n\t" + "vlef %%v17,0(%2),1 \n\t" + "vlef %%v17,0(%2),3 \n\t" +#else + "vlef %%v17,0(%2),1 \n\t" + "vlef %%v17,0(%2),3 \n\t" + "vflcsb %%v17,%%v17 \n\t" + "vlef %%v17,4(%2),0 \n\t" + "vlef %%v17,4(%2),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vlef %%v18,0(%%r1,%1),0 \n\t" + "vlef %%v18,0(%%r1,%1),1 \n\t" + "vlef %%v18,8(%%r1,%1),2 \n\t" + "vlef %%v18,8(%%r1,%1),3 \n\t" + "vlef %%v19,4(%%r1,%1),0 \n\t" + "vlef %%v19,4(%%r1,%1),1 \n\t" + "vlef %%v19,12(%%r1,%1),2 \n\t" + "vlef %%v19,12(%%r1,%1),3 \n\t" + + "vl %%v0,0(%%r1,%3) \n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0 
\n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19" + ); +} + +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) +{ + __asm__ volatile ( +#if !defined(XCONJ) + "vlrepf %%v0,%3 \n\t" + "vlef %%v1,%4,0 \n\t" + "vlef %%v1,%4,2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,%4,1 \n\t" + "vlef %%v1,%4,3 \n\t" +#else + "vlef %%v0,%3,1 \n\t" + "vlef %%v0,%3,3 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,%3,0 \n\t" + "vlef %%v0,%3,2 \n\t" + "vlrepf %%v1,%4 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,2 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,0(%%r1,%2) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + + "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t" + + "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t" + + "vst %%v22,0(%%r1,%2) \n\t" + "vst %%v23,16(%%r1,%2) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); + + x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; + + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += 
inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return(0); + } + + + if ( m3 == 1 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < 
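/* n & -2 clears the low bit, i.e. the largest even count <= n (7 -> 6), so this loop handles two iterations of j per pass and the loop after it mops up the odd remainder */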
( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + return(0); +} diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index e7f096e0d4..9b4077c6b0 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + maxf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - maxf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index b9c1ccd9c9..6e952a3256 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + minf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index aba880949f..d1f1353692 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) > maxf) { @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) > maxf) { diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 3213efa4da..679606a8f8 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) < minf) { @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) < minf) { diff --git 
a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 26fff4eb03..5de41ac7b4 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = x[0]; + i++; + } while (i < n) { if (x[i] > maxf) { @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] > maxf) { diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 570b33a151..7fec111cfb 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = x[0]; + i++; + } while (i < n) { if (x[i] < minf) { @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] < minf) { diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 95a665b10f..d2686c0cd5 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) > maxf) { @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) > maxf) { diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 640fc02c92..768f31a8c7 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) < minf) { @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) < minf) { diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 0eb3503155..8fc32adf6c 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = x[0]; + i++; + } while (i < n) { if (x[i] > maxf) { @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] > maxf) { diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index f050db8cb0..415052810d 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = x[0]; + i++; + } while (i < n) { if (x[i] < minf) { @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] < minf) { diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index bf5f621a7b..541464b055 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + maxf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - 
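/* Same pattern across all the i?max/i?min kernels in this patch: the strided path now peels the first element (seed from x[0], advance, bump the counter) and unrolls the remaining n-1 elements four at a time, so the unroll bound changes from n & -4 to (n - 1) & -4, i.e. n-1 rounded down to a multiple of 4 (& -4 clears the two low bits). */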
maxf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 3636e8fdf5..4b5572b80c 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + minf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 6393b099b0..937bc97538 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -150,7 +150,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG inc_x2; @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { maxf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) > maxf) { - maxf = ABS(x[i*2]); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } + ix += 2; i++; } return (maxf); } else { - - inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) > maxf) { - maxf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) > maxf) { - maxf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) > maxf) { - maxf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (maxf); } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index b15774bb9f..8564edaf45 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -150,7 +150,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT minf = 0.0; BLASLONG inc_x2; @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { minf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) < minf) { - minf = ABS(x[i*2]); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } + ix += 2; i++; } return (minf); } else { - inc_x2 = 2 * inc_x; minf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) < minf) { - minf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) < minf) { - minf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) < minf) { - minf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + while (i < n) { + 
if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (minf); } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 6ba44a27c9..f0e993d2f0 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -106,7 +106,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "agfi %%r1,128 \n\t" "brctg %%r0,0b " : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); } @@ -114,7 +114,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; - FLOAT da[2]; + FLOAT da[2] __attribute__ ((aligned(16))); if (n <= 0) return (0); diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 484db30734..9472b5d5a4 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2018, The OpenBLAS Project +Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,898 +23,693 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
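(Reference sketch for the rewritten zgemv_n kernels below: the new inline-assembly bodies keep the semantics of the scalar fallback they replace, building the {x_r,x_r} and {-x_i,x_i} operand pairs with vlrepg and vleg+wflcdb and accumulating with vfmadb. The function name here is illustrative, not part of the patch; it shows the non-conjugated single-column update, matching the removed scalar code.)

/* y += A_col * x for one complex column, CONJ/XCONJ both unset */
static void zgemv_n_1col_ref(BLASLONG n, const FLOAT *a0, const FLOAT *x, FLOAT *y)
{
    BLASLONG i;
    for (i = 0; i < 2 * n; i += 2) {
        y[i]     += a0[i] * x[0] - a0[i + 1] * x[1];  /* real part      */
        y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0];  /* imaginary part */
    }
}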
- *****************************************************************************/ +*****************************************************************************/ #include #include #include "common.h" -#define HAVE_KERNEL_4x4_VEC 1 -#define HAVE_KERNEL_4x2_VEC 1 -#define HAVE_KERNEL_4x1_VEC 1 -#define HAVE_KERNEL_ADDY 1 - -#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) -#include -#endif - -// #define NBMAX 1024 -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%5) \n\t" + "vl %%v17,16(%5) \n\t" + "vl %%v18,32(%5) \n\t" + "vl %%v19,48(%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - register __vector double vx2_r = {x[4], x[4]}; - register __vector double vx2_i = {-x[5], x[5]}; - register __vector double vx3_r = {x[6], x[6]}; - register __vector double vx3_i = {-x[7], x[7]}; - + "vleg %%v20,8(%5),0 \n\t" + "wflcdb %%v20,%%v20 \n\t" + "vleg %%v20,0(%5),1 \n\t" + "vleg %%v21,24(%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,16(%5),1 \n\t" + "vleg %%v22,40(%5),0 \n\t" + "wflcdb %%v22,%%v22 \n\t" + "vleg %%v22,32(%5),1 \n\t" + "vleg %%v23,56(%5),0 \n\t" + "wflcdb %%v23,%%v23 \n\t" + "vleg %%v23,48(%5),1 \n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; - register __vector double vx2_r = {x[4], -x[4]}; - register __vector double vx2_i = {x[5], x[5]}; - register __vector double vx3_r = {x[6], -x[6]}; - register __vector double vx3_i = {x[7], x[7]}; + "vleg %%v20,0(%5),1 \n\t" + "vflcdb %%v20,%%v20 \n\t" + "vleg %%v20,8(%5),0 \n\t" + "vleg %%v21,16(%5),1 \n\t" + "vflcdb %%v21,%%v21 \n\t" + "vleg %%v21,24(%5),0 \n\t" + "vleg %%v22,32(%5),1 \n\t" + "vflcdb %%v22,%%v22 \n\t" + "vleg %%v22,40(%5),0 \n\t" + "vleg %%v23,48(%5),1 \n\t" + "vflcdb %%v23,%%v23 \n\t" + "vleg %%v23,56(%5),0 \n\t" #endif - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - register __vector double *vptr_a2 = (__vector double *) a2; - register __vector double *vptr_a3 = (__vector double *) a3; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - register __vector double va2 = vptr_a2[i]; - register __vector double va2_1 = vptr_a2[i + 1]; - register __vector double va2_2 = vptr_a2[i + 
2]; - register __vector double va2_3 = vptr_a2[i + 3]; - - register __vector double va3 = vptr_a3[i]; - register __vector double va3_1 = vptr_a3[i + 1]; - register __vector double va3_2 = vptr_a3[i + 2]; - register __vector double va3_3 = vptr_a3[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va2*vx2_r; - vy_1 += va2_1*vx2_r; - vy_2 += va2_2*vx2_r; - vy_3 += va2_3*vx2_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va3*vx3_r; - vy_1 += va3_1*vx3_r; - vy_2 += va3_2*vx3_r; - vy_3 += va3_3*vx3_r; - - va2 = vec_permi(va2, va2, 2); - va2_1 = vec_permi(va2_1, va2_1, 2); - va2_2 = vec_permi(va2_2, va2_2, 2); - va2_3 = vec_permi(va2_3, va2_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - va3 = vec_permi(va3, va3, 2); - va3_1 = vec_permi(va3_1, va3_1, 2); - va3_2 = vec_permi(va3_2, va3_2, 2); - va3_3 = vec_permi(va3_3, va3_3, 2); - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy_0 += va2*vx2_i; - vy_1 += va2_1*vx2_i; - vy_2 += va2_2*vx2_i; - vy_3 += va2_3*vx2_i; - - vy_0 += va3*vx3_i; - vy_1 += va3_1*vx3_i; - vy_2 += va3_2*vx3_i; - vy_3 += va3_3*vx3_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlrepg %%v24,0(%%r1,%1) \n\t" + "vlrepg %%v25,8(%%r1,%1) \n\t" + "vlrepg %%v26,0(%%r1,%2) \n\t" + "vlrepg %%v27,8(%%r1,%2) \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlrepg %%v28,0(%%r1,%3) \n\t" + "vlrepg %%v29,8(%%r1,%3) \n\t" + "vlrepg %%v30,0(%%r1,%4) \n\t" + "vlrepg %%v31,8(%%r1,%4) \n\t" + + "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,0(%%r1,%6) \n\t" + + "vlrepg %%v24,16(%%r1,%1) \n\t" + "vlrepg %%v25,24(%%r1,%1) \n\t" + "vlrepg %%v26,16(%%r1,%2) \n\t" + "vlrepg %%v27,24(%%r1,%2) \n\t" + + "vl %%v0,16(%%r1,%6) \n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlrepg %%v28,16(%%r1,%3) \n\t" + "vlrepg %%v29,24(%%r1,%3) \n\t" + "vlrepg %%v30,16(%%r1,%4) \n\t" + "vlrepg %%v31,24(%%r1,%4) \n\t" + + "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,16(%%r1,%6) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static 
void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - - for (i = 0; i < 2 * n; i += 2) { +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%3) \n\t" + "vl %%v17,16(%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; + "vleg %%v18,8(%3),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%3),1 \n\t" + "vleg %%v19,24(%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,16(%3),1 \n\t" +#else + "vleg %%v18,0(%3),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,8(%3),0 \n\t" + "vleg %%v19,16(%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,24(%3),0 \n\t" #endif - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlrepg %%v20,0(%%r1,%1) \n\t" + "vlrepg %%v21,8(%%r1,%1) \n\t" + "vlrepg %%v22,0(%%r1,%2) \n\t" + "vlrepg %%v23,8(%%r1,%2) \n\t" + + "vl %%v0,0(%%r1,%4) \n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,0(%%r1,%4) \n\t" + + "vlrepg %%v20,16(%%r1,%1) \n\t" + "vlrepg %%v21,24(%%r1,%1) \n\t" + "vlrepg %%v22,16(%%r1,%2) \n\t" + "vlrepg %%v23,24(%%r1,%2) \n\t" + + "vl %%v0,16(%%r1,%4) \n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,16(%%r1,%4) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - - +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - + "vleg %%v17,8(%2),0 \n\t" + "wflcdb %%v17,%%v17 \n\t" + "vleg %%v17,0(%2),1 \n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; + "vleg %%v17,0(%2),1 
\n\t" + "vflcdb %%v17,%%v17 \n\t" + "vleg %%v17,8(%2),0 \n\t" #endif - - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vlrepg %%v18,0(%%r1,%1) \n\t" + "vlrepg %%v19,8(%%r1,%1) \n\t" + + "vl %%v0,0(%%r1,%3) \n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,0(%%r1,%3) \n\t" + + "vlrepg %%v18,16(%%r1,%1) \n\t" + "vlrepg %%v19,24(%%r1,%1) \n\t" + + "vl %%v0,16(%%r1,%3) \n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,16(%%r1,%3) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19" + ); } -#else - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - for (i = 0; i < 2 * n; i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) +{ + __asm__ volatile ( +#if !defined(XCONJ) + "vlrepg %%v0,%3 \n\t" + "vleg %%v1,%4,0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,%4,1 \n\t" +#else + "vleg %%v0,%3,1 \n\t" + "vflcdb %%v0,%%v0 \n\t" + "vleg %%v0,%3,0 \n\t" + "vlrepg %%v1,%4 \n\t" #endif - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,2 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl 
%%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "agfi %%r1,64 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1_VEC - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; + if ( inc_dest != 2 ) + { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -#endif - -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - register __vector double vtemp2_p = {0.0, 0.0}; - register __vector double vtemp2_r = {0.0, 0.0}; - register __vector double vtemp3_p = {0.0, 0.0}; - register __vector double vtemp3_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { -// __builtin_prefetch(&x[i]); -// __builtin_prefetch(&a0[i]); -// __builtin_prefetch(&a1[i]); -// __builtin_prefetch(&a2[i]); -// __builtin_prefetch(&a3[i]); - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double va2 = *(__vector double*) (&a2[i]); - register __vector double va2_1 = *(__vector double*) (&a2[i + 2]); - register __vector double va2_2 = *(__vector double*) (&a2[i + 4]); - register __vector double 
va2_3 = *(__vector double*) (&a2[i + 6]); - - register __vector double va3 = *(__vector double*) (&a3[i]); - register __vector double va3_1 = *(__vector double*) (&a3[i + 2]); - register __vector double va3_2 = *(__vector double*) (&a3[i + 4]); - register __vector double va3_3 = *(__vector double*) (&a3[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vtemp2_p += vx_0*va2; - vtemp2_r += vxr_0*va2; - - vtemp3_p += vx_0*va3; - vtemp3_r += vxr_0*va3; - - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp2_p += vx_1*va2_1; - vtemp2_r += vxr_1*va2_1; - - vtemp3_p += vx_1*va3_1; - vtemp3_r += vxr_1*va3_1; - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp2_p += vx_2*va2_2; - vtemp2_r += vxr_0*va2_2; - - vtemp3_p += vx_2*va3_2; - vtemp3_r += vxr_0*va3_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - vtemp2_p += vx_3*va2_3; - vtemp2_r += vxr_1*va2_3; - - vtemp3_p += vx_3*va3_3; - vtemp3_r += vxr_1*va3_3; - - } +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "vzero %%v17 \n\t" + "vzero %%v18 \n\t" + "vzero %%v19 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v20,0(%%r1,%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - + "vleg %%v21,8(%%r1,%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,0(%%r1,%5),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i 
* temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v21,0(%%r1,%5),1 \n\t" + "vflcdb %%v21,%%v21 \n\t" + "vleg %%v21,8(%%r1,%5),0 \n\t" #endif -} -#else - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_r2 = 0.0; - FLOAT temp_r3 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - FLOAT temp_i2 = 0.0; - FLOAT temp_i3 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vlrepg %%v24,0(%%r1,%1) \n\t" + "vlrepg %%v25,8(%%r1,%1) \n\t" + "vlrepg %%v26,0(%%r1,%2) \n\t" + "vlrepg %%v27,8(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v24,%%v20,%%v16 \n\t" + "vfmadb %%v16,%%v25,%%v21,%%v16 \n\t" + "vfmadb %%v17,%%v26,%%v20,%%v17 \n\t" + "vfmadb %%v17,%%v27,%%v21,%%v17 \n\t" + + "vlrepg %%v28,0(%%r1,%3) \n\t" + "vlrepg %%v29,8(%%r1,%3) \n\t" + "vlrepg %%v30,0(%%r1,%4) \n\t" + "vlrepg %%v31,8(%%r1,%4) \n\t" + + "vfmadb %%v18,%%v28,%%v20,%%v18 \n\t" + "vfmadb %%v18,%%v29,%%v21,%%v18 \n\t" + "vfmadb %%v19,%%v30,%%v20,%%v19 \n\t" + "vfmadb %%v19,%%v31,%%v21,%%v19 \n\t" + + "vl %%v22,16(%%r1,%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; - temp_r2 += a2[i] * x[i] - a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] + a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] - a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] + a3[i + 1] * x[i]; + "vleg %%v23,24(%%r1,%5),0 \n\t" + "wflcdb %%v23,%%v23 \n\t" + "vleg %%v23,16(%%r1,%5),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; - temp_r2 += a2[i] * x[i] + a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] - a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] + a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] - a3[i + 1] * x[i]; + "vleg %%v23,16(%%r1,%5),1 \n\t" + "vflcdb %%v23,%%v23 \n\t" + "vleg %%v23,24(%%r1,%5),0 \n\t" #endif - } + "vlrepg %%v24,16(%%r1,%1) \n\t" + "vlrepg %%v25,24(%%r1,%1) \n\t" + "vlrepg %%v26,16(%%r1,%2) \n\t" + "vlrepg %%v27,24(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v24,%%v22,%%v16 \n\t" + "vfmadb %%v16,%%v25,%%v23,%%v16 \n\t" + "vfmadb %%v17,%%v26,%%v22,%%v17 \n\t" + "vfmadb %%v17,%%v27,%%v23,%%v17 \n\t" + + "vlrepg %%v28,16(%%r1,%3) \n\t" + "vlrepg %%v29,24(%%r1,%3) \n\t" + "vlrepg %%v30,16(%%r1,%4) \n\t" + "vlrepg %%v31,24(%%r1,%4) \n\t" + + "vfmadb %%v18,%%v28,%%v22,%%v18 \n\t" + "vfmadb %%v18,%%v29,%%v23,%%v18 \n\t" + "vfmadb %%v19,%%v30,%%v22,%%v19 \n\t" + "vfmadb %%v19,%%v31,%%v23,%%v19 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * 
temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - + "vlrepg %%v24,0(%7) \n\t" + "vleg %%v25,8(%7),0 \n\t" + "wflcdb %%v25,%%v25 \n\t" + "vleg %%v25,8(%7),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v24,0(%7),1 \n\t" + "vflcdb %%v24,%%v24 \n\t" + "vleg %%v24,0(%7),0 \n\t" + "vlrepg %%v25,8(%7) \n\t" #endif + "vl %%v26,0(%6) \n\t" + "vl %%v27,16(%6) \n\t" + "vl %%v28,32(%6) \n\t" + "vl %%v29,48(%6) \n\t" + "vfmadb %%v26,%%v16,%%v24,%%v26 \n\t" + "vfmadb %%v26,%%v20,%%v25,%%v26 \n\t" + "vfmadb %%v27,%%v17,%%v24,%%v27 \n\t" + "vfmadb %%v27,%%v21,%%v25,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v24,%%v28 \n\t" + "vfmadb %%v28,%%v22,%%v25,%%v28 \n\t" + "vfmadb %%v29,%%v19,%%v24,%%v29 \n\t" + "vfmadb %%v29,%%v23,%%v25,%%v29 \n\t" + "vst %%v26,0(%6) \n\t" + "vst %%v27,16(%6) \n\t" + "vst %%v28,32(%6) \n\t" + "vst %%v29,48(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += 
vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - } - +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "vzero %%v17 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v18,0(%%r1,%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - + "vleg %%v19,8(%%r1,%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,0(%%r1,%3),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v19,0(%%r1,%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,8(%%r1,%3),0 \n\t" #endif -} - -#else - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vlrepg %%v20,0(%%r1,%1) \n\t" + "vlrepg %%v21,8(%%r1,%1) \n\t" + "vlrepg %%v22,0(%%r1,%2) \n\t" + "vlrepg %%v23,8(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" + "vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" + "vfmadb %%v17,%%v22,%%v18,%%v17 \n\t" + "vfmadb %%v17,%%v23,%%v19,%%v17 \n\t" + + "vl %%v18,16(%%r1,%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; + "vleg %%v19,24(%%r1,%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,16(%%r1,%3),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; + "vleg %%v19,16(%%r1,%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,24(%%r1,%3),0 \n\t" #endif - } + "vlrepg %%v20,16(%%r1,%1) \n\t" + "vlrepg %%v21,24(%%r1,%1) \n\t" + "vlrepg %%v22,16(%%r1,%2) \n\t" + "vlrepg %%v23,24(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" + "vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" + "vfmadb %%v17,%%v22,%%v18,%%v17 \n\t" + "vfmadb %%v17,%%v23,%%v19,%%v17 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + + "vpdi %%v18,%%v16,%%v16,4 \n\t" + "vpdi %%v19,%%v17,%%v17,4 \n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - + "vlrepg %%v20,0(%5) \n\t" + "vleg 
%%v21,8(%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,8(%5),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v20,0(%5),1 \n\t" + "vflcdb %%v20,%%v20 \n\t" + "vleg %%v20,0(%5),0 \n\t" + "vlrepg %%v21,8(%5) \n\t" #endif + "vl %%v22,0(%4) \n\t" + "vl %%v23,16(%4) \n\t" + "vfmadb %%v22,%%v16,%%v20,%%v22 \n\t" + "vfmadb %%v22,%%v18,%%v21,%%v22 \n\t" + "vfmadb %%v23,%%v17,%%v20,%%v23 \n\t" + "vfmadb %%v23,%%v19,%%v21,%%v23 \n\t" + "vst %%v22,0(%4) \n\t" + "vst %%v23,16(%4) \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1_VEC - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0 ; - a0 = ap; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - } - +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v17,0(%%r1,%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - + "vleg %%v18,8(%%r1,%2),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%%r1,%2),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + "vleg %%v18,0(%%r1,%2),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,8(%%r1,%2),0 \n\t" #endif -} - -#else - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0; - a0 = ap; + "vlrepg %%v19,0(%%r1,%1) \n\t" + "vlrepg %%v20,8(%%r1,%1) \n\t" + + "vfmadb 
%%v16,%%v19,%%v17,%%v16 \n\t" + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vl %%v17,16(%%r1,%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; + "vleg %%v18,24(%%r1,%2),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,16(%%r1,%2),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; + "vleg %%v18,16(%%r1,%2),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,24(%%r1,%2),0 \n\t" #endif - } -#if !defined(XCONJ) + "vlrepg %%v19,16(%%r1,%1) \n\t" + "vlrepg %%v20,24(%%r1,%1) \n\t" + + "vfmadb %%v16,%%v19,%%v17,%%v16 \n\t" + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + "vpdi %%v17,%%v16,%%v16,4 \n\t" +#if !defined(XCONJ) + "vlrepg %%v18,0(%4) \n\t" + "vleg %%v19,8(%4),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,8(%4),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - + "vleg %%v18,0(%4),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%4),0 \n\t" + "vlrepg %%v19,8(%4) \n\t" #endif - + "vl %%v20,0(%3) \n\t" + "vfmadb %%v20,%%v16,%%v18,%%v20 \n\t" + "vfmadb %%v20,%%v17,%%v19,%%v20 \n\t" + "vst %%v20,0(%3) \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20" + ); } -#endif - -static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8],*xbuffer; + FLOAT alpha[2]; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + 
zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + 
a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * 
temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } - while (j < n) { + + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } - return (0); - } + return(0); + } - if (m3 == 1) { - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; + if ( m3 == 1 ) + { - while (j < (n & -2)) { + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * 
temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } - while (j < n) { + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + return(0); } - diff --git a/ztest/gemv.c b/ztest/gemv.c index f1ee972bc4..964afd3ef3 100644 --- a/ztest/gemv.c +++ b/ztest/gemv.c @@ -52,67 +52,66 @@ int assert_dbl_near(double exp, double real, double tol) { int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; - BLASLONG ix,iy; + BLASLONG ix, iy; BLASLONG j; FLOAT *a_ptr; - FLOAT temp_r,temp_i; - BLASLONG inc_x2,inc_y2; + FLOAT temp_r, temp_i; + BLASLONG inc_x2, inc_y2; BLASLONG lda2; BLASLONG i2; - lda2 = 2*lda; + lda2 = 2 * lda; ix = 0; a_ptr = a; - if ( inc_x == 1 && inc_y == 1 ) + if (inc_x == 1 && inc_y == 1) { - for (j=0; j Date: Mon, 31 Dec 2018 23:10:59 +0100 Subject: [PATCH 004/189] Increment version to 0.3.6.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 24c169afe0..812e6bf6f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 5.dev) +set(OpenBLAS_PATCH_VERSION 6.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From ed704185abd09fe04c6c82cf809c1cb09d359651 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:11:37 +0100 Subject: [PATCH 005/189] Increment version to 0.3.6.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 0d5b83b391..7c128fb498 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.5.dev +VERSION = 0.3.6.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From d11554c88fdf1b6a9cad1c4c1252f27995117378 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Mon, 31 Dec 2018 23:19:44 +0100 Subject: [PATCH 006/189] Validate user supplied TARGET (#1941) The build will now abort with an error message when an undefined build TARGET is named. Fixes #1938 --- Makefile.system | 1 + getarch.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/Makefile.system b/Makefile.system index fb8e7ea419..20d4f64920 100644 --- a/Makefile.system +++ b/Makefile.system @@ -65,6 +65,7 @@ endif ifdef TARGET GETARCH_FLAGS := -DFORCE_$(TARGET) +GETARCH_FLAGS += -DUSER_TARGET endif # Force fallbacks for 32bit diff --git a/getarch.c b/getarch.c index 146f1f36fb..78ba0fefdb 100644 --- a/getarch.c +++ b/getarch.c @@ -1068,6 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef FORCE +#ifdef USER_TARGET +#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt" +#endif + #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) #ifndef POWER From 20d1aad13f59d6146bcdf8be6716cd8cc020d2bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Jan 2019 20:15:35 +0100 Subject: [PATCH 007/189] Fix missing quotes around thunderx targets --- cmake/prebuild.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 6ed99e807b..757461008e 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -198,7 +198,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "THUNDERX) + elseif ("${CORE}" STREQUAL "THUNDERX") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -224,7 +224,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 2) set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 2) - elseif ("${CORE}" STREQUAL "THUNDERX2T99) + elseif ("${CORE}" STREQUAL "THUNDERX2T99") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" From 802f0dbde153b166f533ab1660336d7832e5b616 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Jan 2019 22:17:31 +0100 Subject: [PATCH 008/189] More fixes for cross-compiling ARM64 targets Fixed core naming for DYNAMIC_ARCH. Corrected GEMM_DEFAULT entries and added SYMV_P. Replaced outdated VULCAN define for ThunderX2T99 with ARMV8 to get basic definitions back.
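(For illustration: after this change, a cross-build for the plain ARMV8 target writes a config header roughly equivalent to the sketch below. The values are the ones set in the ARMV8 branch of this patch; the exact macro names emitted for the unroll factors are assumed to follow the non-cross build path and are only summarized in the comment.)

/* sketch of the generated config header for TARGET=ARMV8 (cross-compiled) */
#define ARMV8
#define CHAR_CORENAME "ARMV8"
#define L1_DATA_SIZE 32768
#define L1_DATA_LINESIZE 64
#define L2_SIZE 262144
#define L2_LINESIZE 64
#define DTB_DEFAULT_ENTRIES 64
#define DTB_SIZE 4096
#define L2_ASSOCIATIVE 32
/* plus GEMM kernel shapes from the new set() calls:
   SGEMM 16x4, DGEMM 8x4, CGEMM 8x4, ZGEMM 4x4, SYMV_P 16 */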
For issue #1908 --- cmake/prebuild.cmake | 45 ++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 757461008e..a67c44bf5c 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -87,13 +87,18 @@ endif () # Cannot run getarch on target if we are cross-compiling if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) # Write to config as getarch would + if (DEFINED TARGET_CORE) + set(TCORE ${TARGET_CORE}) + else() + set(TCORE ${CORE}) + endif() # TODO: Set up defines that getarch sets up based on every other target # Perhaps this should be inside a different file as it grows larger file(APPEND ${TARGET_CONF_TEMP} - "#define ${CORE}\n" - "#define CHAR_CORENAME \"${CORE}\"\n") - if ("${CORE}" STREQUAL "ARMV7") + "#define ${TCORE}\n" + "#define CHAR_CORENAME \"${TCORE}\"\n") + if ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_LINESIZE\t32\n" @@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "ARMV8") + elseif ("${TCORE}" STREQUAL "ARMV8") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_LINESIZE\t64\n" @@ -118,9 +123,16 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define DTB_SIZE\t4096\n" "#define L2_ASSOCIATIVE\t32\n" "#define ARMV8\n") - set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53") + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -144,9 +156,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t49152\n" "#define L1_CODE_LINESIZE\t64\n" @@ -170,9 +183,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "FALKOR") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "FALKOR") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" @@ -196,9 +210,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "THUNDERX") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -224,7 +239,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 
2) set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 2) - elseif ("${CORE}" STREQUAL "THUNDERX2T99") + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX2T99") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -240,7 +256,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define L3_ASSOCIATIVE\t32\n" "#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_SIZE\t4096\n" - "#define VULCAN\n") + "#define ARMV8\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 8) @@ -249,6 +265,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) endif() # Or should this actually be NUM_CORES? From 1aa840a0a2e52edfe4572e99131c4f19ccc63e58 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 4 Jan 2019 01:38:18 +0200 Subject: [PATCH 009/189] [ZARCH] fix sgemv_t_4.c --- kernel/zarch/sgemv_t_4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index efc06297f3..fe99ef5ce0 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -158,8 +158,6 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" "vrepf %%v4,%%v0,1 \n\t" "aebr %%f0,%%f4 \n\t" "vrepf %%v4,%%v0,2 \n\t" @@ -351,6 +349,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "vl %%v31,112(%%r1,%1) \n\t" "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" "1: \n\t" "lghi %%r0,28 \n\t" From 94cd946b963e9e077cb4a4c5d93b1ce691e1fe63 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 4 Jan 2019 17:45:56 +0200 Subject: [PATCH 010/189] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 332 +++++++++++++++++++-------------------- 1 file changed, 166 insertions(+), 166 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 4c3253774a..c939aea9fc 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -34,107 +34,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
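[Editor's note on the cgemv_n_4.c hunks that follow: most of the changes are re-indentation, but they also fix a real defect. In GCC extended asm, %0 and %1 expand to the constrained operands while %%r0 names the hardware register literally, so the old "srlg %%r0,%%r0,1" derived the loop count from whatever happened to be in r0 instead of from the length operand; the corrected "srlg %%r0,%0,1" shifts the length. A minimal standalone sketch of the corrected pattern, compilable on s390x only; the halve/len/count names are illustrative, not the kernel's own:

    #include <stdio.h>

    static long halve(long len) {
        long count;
        __asm__("srlg %0,%1,1"     /* count = len >> 1; %0/%1 are operands */
                : "=r" (count)     /* %0: output register */
                : "r" (len));      /* %1: input register */
        return count;
    }

    int main(void) {
        printf("%ld\n", halve(32)); /* prints 16 */
        return 0;
    }
]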
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%5) \n\t" - "vlrepg %%v17,8(%5) \n\t" - "vlrepg %%v18,16(%5) \n\t" - "vlrepg %%v19,24(%5) \n\t" + "vlrepg %%v16,0(%5) \n\t" + "vlrepg %%v17,8(%5) \n\t" + "vlrepg %%v18,16(%5) \n\t" + "vlrepg %%v19,24(%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" #else - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlef %%v24,0(%%r1,%1),0 \n\t" - "vlef %%v24,0(%%r1,%1),1 \n\t" - "vlef %%v24,8(%%r1,%1),2 \n\t" - "vlef %%v24,8(%%r1,%1),3 \n\t" - "vlef %%v25,4(%%r1,%1),0 \n\t" - "vlef %%v25,4(%%r1,%1),1 \n\t" - "vlef %%v25,12(%%r1,%1),2 \n\t" - "vlef %%v25,12(%%r1,%1),3 \n\t" - "vlef %%v26,0(%%r1,%2),0 \n\t" - "vlef %%v26,0(%%r1,%2),1 \n\t" - "vlef %%v26,8(%%r1,%2),2 \n\t" - "vlef %%v26,8(%%r1,%2),3 \n\t" - "vlef %%v27,4(%%r1,%2),0 \n\t" - "vlef %%v27,4(%%r1,%2),1 \n\t" - "vlef %%v27,12(%%r1,%2),2 \n\t" - "vlef %%v27,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmasb 
%%v0,%%v25,%%v20,%%v0 \n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlef %%v28,0(%%r1,%1),0 \n\t" - "vlef %%v28,0(%%r1,%1),1 \n\t" - "vlef %%v28,8(%%r1,%1),2 \n\t" - "vlef %%v28,8(%%r1,%1),3 \n\t" - "vlef %%v29,4(%%r1,%1),0 \n\t" - "vlef %%v29,4(%%r1,%1),1 \n\t" - "vlef %%v29,12(%%r1,%1),2 \n\t" - "vlef %%v29,12(%%r1,%1),3 \n\t" - "vlef %%v30,0(%%r1,%2),0 \n\t" - "vlef %%v30,0(%%r1,%2),1 \n\t" - "vlef %%v30,8(%%r1,%2),2 \n\t" - "vlef %%v30,8(%%r1,%2),3 \n\t" - "vlef %%v31,4(%%r1,%2),0 \n\t" - "vlef %%v31,4(%%r1,%2),1 \n\t" - "vlef %%v31,12(%%r1,%2),2 \n\t" - "vlef %%v31,12(%%r1,%2),3 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlef %%v24,0(%%r1,%1),0 \n\t" + "vlef %%v24,0(%%r1,%1),1 \n\t" + "vlef %%v24,8(%%r1,%1),2 \n\t" + "vlef %%v24,8(%%r1,%1),3 \n\t" + "vlef %%v25,4(%%r1,%1),0 \n\t" + "vlef %%v25,4(%%r1,%1),1 \n\t" + "vlef %%v25,12(%%r1,%1),2 \n\t" + "vlef %%v25,12(%%r1,%1),3 \n\t" + "vlef %%v26,0(%%r1,%2),0 \n\t" + "vlef %%v26,0(%%r1,%2),1 \n\t" + "vlef %%v26,8(%%r1,%2),2 \n\t" + "vlef %%v26,8(%%r1,%2),3 \n\t" + "vlef %%v27,4(%%r1,%2),0 \n\t" + "vlef %%v27,4(%%r1,%2),1 \n\t" + "vlef %%v27,12(%%r1,%2),2 \n\t" + "vlef %%v27,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlef %%v28,0(%%r1,%1),0 \n\t" + "vlef %%v28,0(%%r1,%1),1 \n\t" + "vlef %%v28,8(%%r1,%1),2 \n\t" + "vlef %%v28,8(%%r1,%1),3 \n\t" + "vlef %%v29,4(%%r1,%1),0 \n\t" + "vlef %%v29,4(%%r1,%1),1 \n\t" + "vlef %%v29,12(%%r1,%1),2 \n\t" + "vlef %%v29,12(%%r1,%1),3 \n\t" + "vlef %%v30,0(%%r1,%2),0 \n\t" + "vlef %%v30,0(%%r1,%2),1 \n\t" + "vlef %%v30,8(%%r1,%2),2 \n\t" + "vlef %%v30,8(%%r1,%2),3 \n\t" + "vlef %%v31,4(%%r1,%2),0 \n\t" + "vlef %%v31,4(%%r1,%2),1 \n\t" + "vlef %%v31,12(%%r1,%2),2 \n\t" + "vlef %%v31,12(%%r1,%2),3 \n\t" "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" @@ -153,56 +153,56 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%3) \n\t" - "vlrepg %%v17,8(%3) \n\t" + "vlrepg %%v16,0(%3) \n\t" + "vlrepg %%v17,8(%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" #else - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + + "vlef 
%%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlef %%v20,0(%%r1,%1),0 \n\t" - "vlef %%v20,0(%%r1,%1),1 \n\t" - "vlef %%v20,8(%%r1,%1),2 \n\t" - "vlef %%v20,8(%%r1,%1),3 \n\t" - "vlef %%v21,4(%%r1,%1),0 \n\t" - "vlef %%v21,4(%%r1,%1),1 \n\t" - "vlef %%v21,12(%%r1,%1),2 \n\t" - "vlef %%v21,12(%%r1,%1),3 \n\t" - "vlef %%v22,0(%%r1,%2),0 \n\t" - "vlef %%v22,0(%%r1,%2),1 \n\t" - "vlef %%v22,8(%%r1,%2),2 \n\t" - "vlef %%v22,8(%%r1,%2),3 \n\t" - "vlef %%v23,4(%%r1,%2),0 \n\t" - "vlef %%v23,4(%%r1,%2),1 \n\t" - "vlef %%v23,12(%%r1,%2),2 \n\t" - "vlef %%v23,12(%%r1,%2),3 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlef %%v20,0(%%r1,%1),0 \n\t" + "vlef %%v20,0(%%r1,%1),1 \n\t" + "vlef %%v20,8(%%r1,%1),2 \n\t" + "vlef %%v20,8(%%r1,%1),3 \n\t" + "vlef %%v21,4(%%r1,%1),0 \n\t" + "vlef %%v21,4(%%r1,%1),1 \n\t" + "vlef %%v21,12(%%r1,%1),2 \n\t" + "vlef %%v21,12(%%r1,%1),3 \n\t" + "vlef %%v22,0(%%r1,%2),0 \n\t" + "vlef %%v22,0(%%r1,%2),1 \n\t" + "vlef %%v22,8(%%r1,%2),2 \n\t" + "vlef %%v22,8(%%r1,%2),3 \n\t" + "vlef %%v23,4(%%r1,%2),0 \n\t" + "vlef %%v23,4(%%r1,%2),1 \n\t" + "vlef %%v23,12(%%r1,%2),2 \n\t" + "vlef %%v23,12(%%r1,%2),3 \n\t" "vl %%v0,0(%%r1,%4) \n\t" "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" @@ -222,34 +222,34 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%2) \n\t" + "vlrepg %%v16,0(%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,4(%2),2 \n\t" "vflcsb %%v17,%%v17 \n\t" "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,0(%2),3 \n\t" #else "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,0(%2),3 \n\t" "vflcsb %%v17,%%v17 \n\t" "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,4(%2),2 \n\t" #endif "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" + "srlg %%r0,%0,1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%1) \n\t" "pfd 2,1024(%%r1,%3) \n\t" - "vlef %%v18,0(%%r1,%1),0 \n\t" - "vlef %%v18,0(%%r1,%1),1 \n\t" - "vlef %%v18,8(%%r1,%1),2 \n\t" - "vlef %%v18,8(%%r1,%1),3 \n\t" - "vlef %%v19,4(%%r1,%1),0 \n\t" - "vlef %%v19,4(%%r1,%1),1 \n\t" - "vlef %%v19,12(%%r1,%1),2 \n\t" - "vlef %%v19,12(%%r1,%1),3 \n\t" + "vlef %%v18,0(%%r1,%1),0 \n\t" + "vlef %%v18,0(%%r1,%1),1 \n\t" + "vlef %%v18,8(%%r1,%1),2 \n\t" + "vlef %%v18,8(%%r1,%1),3 \n\t" + "vlef %%v19,4(%%r1,%1),0 \n\t" + "vlef %%v19,4(%%r1,%1),1 \n\t" + "vlef %%v19,12(%%r1,%1),2 \n\t" + "vlef %%v19,12(%%r1,%1),3 \n\t" "vl %%v0,0(%%r1,%3) \n\t" "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" @@ -268,18 +268,18 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al { __asm__ volatile ( #if !defined(XCONJ) - "vlrepf %%v0,%3 \n\t" - "vlef %%v1,%4,0 \n\t" - "vlef %%v1,%4,2 \n\t" + "vlrepf %%v0,%3 \n\t" + "vlef %%v1,%4,0 \n\t" + "vlef %%v1,%4,2 \n\t" "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,%4,1 \n\t" + "vlef %%v1,%4,1 \n\t" "vlef %%v1,%4,3 \n\t" #else "vlef %%v0,%3,1 \n\t" - "vlef %%v0,%3,3 \n\t" + "vlef %%v0,%3,3 \n\t" "vflcsb %%v0,%%v0 \n\t" "vlef %%v0,%3,0 \n\t" - "vlef %%v0,%3,2 \n\t" + "vlef 
%%v0,%3,2 \n\t" "vlrepf %%v1,%4 \n\t" #endif "xgr %%r1,%%r1 \n\t" @@ -292,7 +292,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al "vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,0(%%r1,%2) \n\t" "vl %%v19,16(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" + "verllg %%v20,%%v16,32 \n\t" "verllg %%v21,%%v17,32 \n\t" "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" From ae1d1f74f7ff96b8345189bcba058b7acdc7d494 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 16:55:33 +0100 Subject: [PATCH 011/189] Query AVX2 and AVX512 capability for runtime cpu selection --- driver/others/dynamic.c | 141 +++++++++++++++++++++++++++++----------- 1 file changed, 102 insertions(+), 39 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1f67dc5215..7cc911d32e 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -304,9 +304,47 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#ifndef NO_AVX512 + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 1){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" +#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512 instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ @@ -403,18 +441,24 @@ static gotoblas_t *get_coretype(void){ } //Intel Haswell if (model == 12 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 13) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -424,27 +468,36 @@ static gotoblas_t *get_coretype(void){ case 4: //Intel Haswell if (model == 5 || model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
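[Editor's note: the new support_avx2()/support_avx512() probes above key off CPUID leaf 7, which is subleaf-indexed: ECX must be zeroed before the query, which is exactly why the cpuid() helpers in the next patch gain a "c"(0) input. A minimal sketch of the same probe using the GCC/clang <cpuid.h> helper, not OpenBLAS code; the bit positions are the ones documented in the Intel SDM (EBX bit 5 = AVX2, EBX bit 31 = AVX512VL):

    #include <stdio.h>
    #include <cpuid.h>              /* GCC/clang helper, x86 only */

    int main(void) {
        unsigned eax, ebx, ecx, edx;
        /* leaf 7 is subleaf-indexed: subleaf 0 holds the feature flags */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            return 1;               /* CPU too old to have leaf 7 */
        printf("AVX2     : %u\n", (ebx >> 5) & 1);   /* SDM: EBX bit 5  */
        printf("AVX512VL : %u\n", (ebx >> 31) & 1);  /* SDM: EBX bit 31 */
        return 0;
    }
]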
} } //Intel Broadwell if (model == 7 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -457,40 +510,50 @@ static gotoblas_t *get_coretype(void){ case 5: //Intel Broadwell if (model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } if (model == 5) { // Intel Skylake X -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) return &gotoblas_HASWELL; - else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } -#endif + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Phi Knights Landing if (model == 7) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -503,26 +566,26 @@ static gotoblas_t *get_coretype(void){ case 6: if (model == 6) { // Cannon Lake -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return &gotoblas_HASWELL; -#else - return &gotoblas_SANDYBRIDGE; -#endif - else - return &gotoblas_NEHALEM; -#endif + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
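[Editor's note: the CPUID checks above only establish that the silicon has the instructions; the xcr0 patches a little further down add the other half of the test, namely whether the OS actually saves the wider register state across context switches. A sketch of that half, under the assumption that CPUID.1:ECX.OSXSAVE is already known to be set (xgetbv faults otherwise); read_xcr0 is an illustrative name:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t read_xcr0(void) {
        uint32_t lo, hi;
        __asm__("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0)); /* XCR0 = reg 0 */
        return ((uint64_t)hi << 32) | lo;
    }

    int main(void) {
        uint64_t xcr0 = read_xcr0();
        /* bits 1,2 = XMM+YMM state; bits 5-7 (0xe0) = opmask+ZMM state */
        printf("YMM state saved    : %s\n", (xcr0 & 0x06) == 0x06 ? "yes" : "no");
        printf("AVX-512 state saved: %s\n", (xcr0 & 0xe0) == 0xe0 ? "yes" : "no");
        return 0;
    }
]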
} From 0afaae4b2323b28af49ffe81b98d17bd4ced96f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 16:58:56 +0100 Subject: [PATCH 012/189] Query AVX2 and AVX512VL capability in x86 cpu detection --- common_x86_64.h | 2 +- cpuid.h | 1 + cpuid_x86.c | 132 +++++++++++++++++++++++++++--------------------- 3 files changed, 76 insertions(+), 59 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 62e138e342..f27c1e9be8 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -134,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op)); + : "0" (op), "c"(0)); #endif } diff --git a/cpuid.h b/cpuid.h index a6bc211f3e..c56672ad8b 100644 --- a/cpuid.h +++ b/cpuid.h @@ -139,6 +139,7 @@ #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) #define HAVE_AVX512VL (1 << 21) +#define HAVE_AVX2 (1 << 22) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 diff --git a/cpuid_x86.c b/cpuid_x86.c index eb986b6b68..ddc09857b6 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ ("mov %%ebx, %%edi;" "cpuid;" "xchgl %%ebx, %%edi;" - : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc"); #else __asm__ __volatile__ - ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc"); #endif } @@ -211,6 +211,42 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#ifndef NO_AVX512 + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & 32) != 32){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + int get_vendor(void){ int eax, ebx, ecx, edx; @@ -294,6 +330,8 @@ int get_cputype(int gettype){ if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; #ifndef NO_AVX if (support_avx()) feature |= HAVE_AVX; + if (support_avx2()) feature |= HAVE_AVX2; + if (support_avx512()) feature |= HAVE_AVX512VL; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1228,22 +1266,18 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 12: case 15: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 13: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -1252,33 +1286,27 @@ int get_cpuname(void){ switch (model) { case 5: case 6: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: case 15: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 14: //Skylake - if(support_avx()) -#ifndef NO_AVX2 + 
if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: @@ -1292,46 +1320,36 @@ int get_cpuname(void){ switch (model) { case 6: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 5: // Skylake X -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif case 14: // Skylake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: // Xeon Phi Knights Landing - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: @@ -1342,30 +1360,24 @@ int get_cpuname(void){ case 6: switch (model) { case 6: // Cannon Lake -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif } break; case 9: case 8: switch (model) { case 14: // Kaby Lake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -2112,6 +2124,8 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); + if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); + if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2180,6 +2194,8 @@ void get_sse(void){ if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); + if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); + if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); From 68eb3146ce4c50ac557cf5f199cc1b4294ba3817 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 18:07:14 +0100 Subject: [PATCH 013/189] Add xcr0 (os support) check --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index ddc09857b6..377267fcc0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -239,6 +239,8 @@ int support_avx512(){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL } return ret; From e1574fa2b4a2a781be70d8d521bb3b80a572ca9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 18:08:02 +0100 
Subject: [PATCH 014/189] Add xcr0 (os support) check --- driver/others/dynamic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 7cc911d32e..4c966260d8 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -332,6 +332,8 @@ int support_avx512(){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL } return ret; From 31ed19e8b907f72ed4c8ef3165d8577b55264861 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Jan 2019 19:41:13 +0100 Subject: [PATCH 015/189] Add message for SkylakeX and KNL fallbacks to Haswell --- driver/others/dynamic.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 4c966260d8..ba93fca8b2 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -346,7 +346,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" #define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" -#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512 instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. 
OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ @@ -526,8 +526,10 @@ static gotoblas_t *get_coretype(void){ // Intel Skylake X if (support_avx512()) return &gotoblas_SKYLAKEX; - if(support_avx2()) + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; + } if(support_avx()) { openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); return &gotoblas_SANDYBRIDGE; @@ -550,8 +552,10 @@ static gotoblas_t *get_coretype(void){ } //Intel Phi Knights Landing if (model == 7) { - if(support_avx2()) + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; + } if(support_avx()) { openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); return &gotoblas_SANDYBRIDGE; From 191677b902054d1476f3bb12b5360c337c47eb7e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 10:46:47 +0100 Subject: [PATCH 016/189] Add travis_wait to the OSX brew install phase --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3f323a8540..e8b7e0a27c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -153,7 +153,7 @@ matrix: before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - brew install gcc # for gfortran + - travis_wait 30 brew install gcc # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: From cf5d48e83300a5eb2bb047829fc793ba78959c35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 14:41:48 +0100 Subject: [PATCH 017/189] Update OSX environment to Sierra as homebrew seems to have dropped support for El Capitan in their gcc packages --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3f323a8540..51679af620 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,7 +149,7 @@ matrix: - &test-macos os: osx - osx_image: xcode8 + osx_image: xcode8.3 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update From 1650311246d185ca2631c76c33c0212848b57d2a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Jan 2019 14:43:45 +0100 Subject: [PATCH 018/189] Bump xcode to 8.3 --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index e8b7e0a27c..51679af620 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,11 +149,11 @@ matrix: - &test-macos os: osx - osx_image: xcode8 + osx_image: xcode8.3 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update - - travis_wait 30 brew install gcc # for gfortran + - brew install gcc # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: From 3eafcfa6507891f7fff781423d9eb6af13501133 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 07:43:45 +0200 Subject: [PATCH 019/189] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index c939aea9fc..7b5e43497f 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -119,22 +119,22 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - "vlef %%v28,0(%%r1,%1),0 \n\t" - "vlef %%v28,0(%%r1,%1),1 \n\t" - "vlef %%v28,8(%%r1,%1),2 \n\t" - "vlef 
%%v28,8(%%r1,%1),3 \n\t" - "vlef %%v29,4(%%r1,%1),0 \n\t" - "vlef %%v29,4(%%r1,%1),1 \n\t" - "vlef %%v29,12(%%r1,%1),2 \n\t" - "vlef %%v29,12(%%r1,%1),3 \n\t" - "vlef %%v30,0(%%r1,%2),0 \n\t" - "vlef %%v30,0(%%r1,%2),1 \n\t" - "vlef %%v30,8(%%r1,%2),2 \n\t" - "vlef %%v30,8(%%r1,%2),3 \n\t" - "vlef %%v31,4(%%r1,%2),0 \n\t" - "vlef %%v31,4(%%r1,%2),1 \n\t" - "vlef %%v31,12(%%r1,%2),2 \n\t" - "vlef %%v31,12(%%r1,%2),3 \n\t" + "vlef %%v28,0(%%r1,%3),0 \n\t" + "vlef %%v28,0(%%r1,%3),1 \n\t" + "vlef %%v28,8(%%r1,%3),2 \n\t" + "vlef %%v28,8(%%r1,%3),3 \n\t" + "vlef %%v29,4(%%r1,%3),0 \n\t" + "vlef %%v29,4(%%r1,%3),1 \n\t" + "vlef %%v29,12(%%r1,%3),2 \n\t" + "vlef %%v29,12(%%r1,%3),3 \n\t" + "vlef %%v30,0(%%r1,%4),0 \n\t" + "vlef %%v30,0(%%r1,%4),1 \n\t" + "vlef %%v30,8(%%r1,%4),2 \n\t" + "vlef %%v30,8(%%r1,%4),3 \n\t" + "vlef %%v31,4(%%r1,%4),0 \n\t" + "vlef %%v31,4(%%r1,%4),1 \n\t" + "vlef %%v31,12(%%r1,%4),2 \n\t" + "vlef %%v31,12(%%r1,%4),3 \n\t" "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" From e7455f500c06ecda4085d560ffa20c5bc188416f Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:33:54 +0200 Subject: [PATCH 020/189] [ZARCH] fix dsdot.c --- kernel/zarch/dsdot.c | 123 ++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 67 deletions(-) diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 17461a0290..800bb0d51a 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -27,61 +27,34 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { double dot; __asm__ volatile ( "vzero %%v0 \n\t" - "srlg %%r0,%1,5 \n\t" + "srlg %%r0,%1,4 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmsb %%v16,%%v16,%%v24 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmsb %%v17,%%v17,%%v25 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmsb %%v18,%%v18,%%v26 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmsb %%v19,%%v19,%%v27 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmsb %%v20,%%v20,%%v28 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmsb %%v21,%%v21,%%v29 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmsb %%v22,%%v22,%%v30 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmsb %%v23,%%v23,%%v31 \n\t" - - "vflls %%v24,%%v16 \n\t" - "vflls %%v25,%%v17 \n\t" - "vflls %%v26,%%v18 \n\t" - "vflls %%v27,%%v19 \n\t" - "vflls %%v28,%%v20 \n\t" - "vflls %%v29,%%v21 \n\t" - "vflls %%v30,%%v22 \n\t" - "vflls %%v31,%%v23 \n\t" - - "veslg %%v16,%%v16,32 \n\t" - "veslg %%v17,%%v17,32 \n\t" - "veslg %%v18,%%v18,32 \n\t" - "veslg %%v19,%%v19,32 \n\t" - "veslg %%v20,%%v20,32 \n\t" - "veslg %%v21,%%v21,32 \n\t" - "veslg %%v22,%%v22,32 \n\t" - "veslg %%v23,%%v23,32 \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v16,4(%%r1,%2),2 \n\t" + "vlef %%v17,8(%%r1,%2),0 \n\t" + "vlef %%v17,12(%%r1,%2),2 \n\t" + "vlef %%v18,16(%%r1,%2),0 \n\t" + "vlef %%v18,20(%%r1,%2),2 \n\t" + "vlef %%v19,24(%%r1,%2),0 \n\t" + "vlef %%v19,28(%%r1,%2),2 \n\t" + "vlef %%v20,32(%%r1,%2),0 \n\t" + "vlef %%v20,36(%%r1,%2),2 \n\t" + "vlef %%v21,40(%%r1,%2),0 \n\t" + "vlef %%v21,44(%%r1,%2),2 \n\t" + "vlef 
%%v22,48(%%r1,%2),0 \n\t" + "vlef %%v22,52(%%r1,%2),2 \n\t" + "vlef %%v23,56(%%r1,%2),0 \n\t" + "vlef %%v23,60(%%r1,%2),2 \n\t" "vflls %%v16,%%v16 \n\t" "vflls %%v17,%%v17 \n\t" @@ -92,24 +65,40 @@ static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) "vflls %%v22,%%v22 \n\t" "vflls %%v23,%%v23 \n\t" - "vfadb %%v16,%%v16,%%v24 \n\t" - "vfadb %%v17,%%v17,%%v25 \n\t" - "vfadb %%v18,%%v18,%%v26 \n\t" - "vfadb %%v19,%%v19,%%v27 \n\t" - "vfadb %%v20,%%v20,%%v28 \n\t" - "vfadb %%v21,%%v21,%%v29 \n\t" - "vfadb %%v22,%%v22,%%v30 \n\t" - "vfadb %%v23,%%v23,%%v31 \n\t" - "vfadb %%v16,%%v16,%%v20 \n\t" - "vfadb %%v17,%%v17,%%v21 \n\t" - "vfadb %%v18,%%v18,%%v22 \n\t" - "vfadb %%v19,%%v19,%%v23 \n\t" - "vfadb %%v16,%%v16,%%v18 \n\t" - "vfadb %%v17,%%v17,%%v19 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v0,%%v16,%%v0 \n\t" - - "agfi %%r1,128 \n\t" + "vlef %%v24,0(%%r1,%3),0 \n\t" + "vlef %%v24,4(%%r1,%3),2 \n\t" + "vflls %%v24,%%v24 \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vlef %%v25,8(%%r1,%3),0 \n\t" + "vlef %%v25,12(%%r1,%3),2 \n\t" + "vflls %%v25,%%v25 \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + "vlef %%v26,16(%%r1,%3),0 \n\t" + "vlef %%v26,20(%%r1,%3),2 \n\t" + "vflls %%v26,%%v26 \n\t" + "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" + "vlef %%v27,24(%%r1,%3),0 \n\t" + "vlef %%v27,28(%%r1,%3),2 \n\t" + "vflls %%v27,%%v27 \n\t" + "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" + "vlef %%v28,32(%%r1,%3),0 \n\t" + "vlef %%v28,36(%%r1,%3),2 \n\t" + "vflls %%v28,%%v28 \n\t" + "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" + "vlef %%v29,40(%%r1,%3),0 \n\t" + "vlef %%v29,44(%%r1,%3),2 \n\t" + "vflls %%v29,%%v29 \n\t" + "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" + "vlef %%v30,48(%%r1,%3),0 \n\t" + "vlef %%v30,52(%%r1,%3),2 \n\t" + "vflls %%v30,%%v30 \n\t" + "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" + "vlef %%v31,56(%%r1,%3),0 \n\t" + "vlef %%v31,60(%%r1,%3),2 \n\t" + "vflls %%v31,%%v31 \n\t" + "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,64 \n\t" "brctg %%r0,0b \n\t" "vrepg %%v1,%%v0,1 \n\t" "adbr %%f0,%%f1 \n\t" @@ -134,10 +123,10 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 ) - dot = dsdot_kernel_32(n1,x,y); + dot = dsdot_kernel_16(n1,x,y); i = n1; while(i < n) From c2ffef81569624cc530d515bbaac9890d819253b Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:49:44 +0200 Subject: [PATCH 021/189] [ZARCH] fix data prefetch type in ddot --- kernel/zarch/ddot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f34d1e96e8..ff4c347a6c 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -37,7 +37,7 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%2) \n\t" From be66f5d5c21b558dd1ef35dc8f4bda6b544b4f79 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:50:07 +0200 Subject: [PATCH 022/189] [ZARCH] fix data prefetch type in sdot --- kernel/zarch/sdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index fd8c8e4455..5ddbc69bd6 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -37,7 +37,7 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" 
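[Editor's note: the two one-line fixes above change the y-operand prefetch in ddot/sdot from "pfd 2" (z/Architecture PREFETCH DATA with store intent) to "pfd 1" (load intent). A dot product only reads y, and store-intent prefetches can pull cache lines in exclusive state for no benefit. A portable sketch of the same idea with GCC's __builtin_prefetch; dot_ref, the prefetch distance, and the stride are illustrative:

    #include <stddef.h>

    double dot_ref(size_t n, const double *x, const double *y) {
        double s = 0.0;
        for (size_t i = 0; i < n; i++) {
            if ((i & 31) == 0) {                    /* once per 32 elements */
                __builtin_prefetch(&x[i + 128], 0); /* 0 = read intent, like pfd 1 */
                __builtin_prefetch(&y[i + 128], 0); /* was write intent, like pfd 2 */
            }
            s += x[i] * y[i];
        }
        return s;
    }
]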
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%2) \n\t" From ad2c386d6ad99d3021e33cbbfb311150b2586c93 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jan 2019 00:32:50 +0100 Subject: [PATCH 023/189] Move TLS key deletion to openblas_quit fixes #1954 (as suggested by thrasibule in that issue) --- driver/others/memory.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 6f7a7db825..72d3e173cf 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1073,11 +1073,6 @@ static volatile int memory_initialized = 0; } free(table); } -#if defined(OS_WINDOWS) - TlsFree(local_storage_key); -#else - pthread_key_delete(local_storage_key); -#endif } static void blas_memory_init(){ @@ -1491,6 +1486,14 @@ void DESTRUCTOR gotoblas_quit(void) { blas_shutdown(); +#if defined(SMP) +#if defined(OS_WINDOWS) + TlsFree(local_storage_key); +#else + pthread_key_delete(local_storage_key); +#endif +#endif + #ifdef PROFILE moncontrol (0); #endif From 67432b23c2fe7f8ef29cf85821278dcdf69b4db2 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 16:44:46 +0200 Subject: [PATCH 024/189] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 7b5e43497f..a45c3d6870 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, ap[3] = ap[2] + lda; x_ptr = x; //zero_y(NB,ybuffer); - memset(ybuffer,0,NB*16); + memset(ybuffer,0,NB*8); if ( inc_x == 2 ) { From 5d89d6b143ea770e4dcb2336319b543f2297c6ba Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:08:24 +0200 Subject: [PATCH 025/189] [ZARCH] fix sgemv_n_4.c --- kernel/zarch/sgemv_n_4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 92019d7322..01d8414de4 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -435,7 +435,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ap[3] = ap[2] + lda; if ( inc_y != 1 ) - memset(ybuffer,0,NB*8); + memset(ybuffer,0,NB*4); else ybuffer = y_ptr; @@ -465,8 +465,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; + /* a_ptr += lda; + x_ptr += 1; */ } From ecc31b743fc93d3b5951e83e6e37148dbdd381c8 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:13:02 +0200 Subject: [PATCH 026/189] Update dgemv_t_4.c --- kernel/zarch/dgemv_t_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index f9c1f966dd..2d8fa0d104 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -601,9 +601,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += lda; + // a_ptr += lda; *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; + // y_ptr += inc_y; } a += NB; From b731e8246f9fad13637005be39d8566111bab9fe Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:14:04 +0200 Subject: [PATCH 027/189] Update sgemv_t_4.c --- kernel/zarch/sgemv_t_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c 
b/kernel/zarch/sgemv_t_4.c index fe99ef5ce0..5515d7bb7d 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -605,9 +605,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += lda; + // a_ptr += lda; *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; + // y_ptr += inc_y; } a += NB; From 621dedb37bd1d33c7006c305b4057bb0cc7ea7cd Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:37:11 +0200 Subject: [PATCH 028/189] [ZARCH] Update cgemv_t_4.c --- kernel/zarch/cgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 89914fb1f9..0dd43057c2 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define NBMAX 1024 +#define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { From 406f835f00fedcfef894742b30a7f48905836eee Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:39:17 +0200 Subject: [PATCH 029/189] [ZARCH] update cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index a45c3d6870..ed81325e1a 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#define NBMAX 1024 +#define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { From 1a7925b3a335114d26bd1d25d6f6fdc2743909b6 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:43:11 +0200 Subject: [PATCH 030/189] [ZARCH] Update dgemv_n_4.c --- kernel/zarch/dgemv_n_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index ca6d287bc8..ca4fd61709 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -488,8 +488,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; + /* a_ptr += lda; + x_ptr += 1; */ } From 00401489c2d82e1dd997f91480fe6bc441cd6b40 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Jan 2019 22:38:32 +0100 Subject: [PATCH 031/189] Fix missing braces in support_avx() --- cpuid_x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 377267fcc0..74cc6655b1 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -216,7 +216,7 @@ int support_avx2(){ int eax, ebx, ecx=0, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & (1<<7)) != 0) @@ -232,7 +232,7 @@ int support_avx512(){ int eax, ebx, ecx, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & 32) != 32){ From dbc9a060ef4d6ba08b21352f22bb2fa989db0919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Jan 2019 22:41:31 +0100 Subject: [PATCH 032/189] Fix missing braces in support_avx() call --- driver/others/dynamic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ba93fca8b2..9e59da2ccb 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c
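[Editor's note: patches 031 and 032 fix the same one-character bug. "if (!support_avx)" tests the address of the function, which decays to a pointer and is always non-null, so the non-AVX fallback could never trigger; "if (!support_avx())" tests the call's result. A minimal standalone reproduction, with a stand-in function body:

    #include <stdio.h>

    static int support_avx(void) { return 0; }  /* stand-in: pretend AVX absent */

    int main(void) {
        if (!support_avx)                 /* address test: condition never true */
            printf("never reached\n");
        if (!support_avx())               /* call-result test: taken here */
            printf("fallback path taken\n");
        return 0;
    }
]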
@@ -309,7 +309,7 @@ int support_avx2(){ int eax, ebx, ecx=0, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & (1<<7)) != 0) @@ -325,7 +325,7 @@ int support_avx512(){ int eax, ebx, ecx, edx; int ret=0; - if (!support_avx) + if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); if((ebx & (1<<7)) != 1){ From b815a04c87e49a01e66e1c41ce4654f8d7817f83 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 15 Jan 2019 21:04:22 +0200 Subject: [PATCH 033/189] [ZARCH] fix a bug in max/min functions --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/damax.c | 2 +- kernel/zarch/damin.c | 2 +- kernel/zarch/dmax.c | 2 +- kernel/zarch/dmin.c | 2 +- kernel/zarch/idamax.c | 2 +- kernel/zarch/idamin.c | 2 +- kernel/zarch/idmax.c | 2 +- kernel/zarch/idmin.c | 2 +- kernel/zarch/isamax.c | 2 +- kernel/zarch/isamin.c | 2 +- kernel/zarch/ismax.c | 2 +- kernel/zarch/ismin.c | 2 +- kernel/zarch/samax.c | 2 +- kernel/zarch/samin.c | 2 +- kernel/zarch/smax.c | 2 +- kernel/zarch/smin.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 3506c4e9b9..2c913b62e5 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -237,7 +237,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 726747b999..733f98fbf9 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -237,7 +237,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index b74af5d372..236d11c722 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -172,7 +172,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 4cf5e88b13..c2c63c6c5a 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -172,7 +172,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index de38bd21a7..469f657358 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -148,7 +148,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index d7c86735f7..3df5049500 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -148,7 +148,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index d1f1353692..4f7ff69857 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -226,7 +226,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git 
a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 679606a8f8..3abc7a5585 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -226,7 +226,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 5de41ac7b4..313a88db44 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -202,7 +202,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 7fec111cfb..42443215be 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -202,7 +202,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index d2686c0cd5..dd2144db21 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -269,7 +269,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 768f31a8c7..d7e44421d0 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -269,7 +269,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 8fc32adf6c..1ebc6c8c8e 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -245,7 +245,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 415052810d..a6b9d59de9 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -245,7 +245,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index 1025cfcbfe..61d50159fe 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -176,7 +176,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 3b8f03e6a2..a585a79ffb 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -176,7 +176,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index 33798eb7c9..bcdb473afa 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index e882b7ff17..91c31d284d 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - 
while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 937bc97538..8ef3f42ca9 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -189,7 +189,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 8564edaf45..30fd1d030a 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -189,7 +189,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); From 29dc72889f5c0544aee8bc5f2dee98603cbfec36 Mon Sep 17 00:00:00 2001 From: caiyu Date: Wed, 16 Jan 2019 14:25:19 +0800 Subject: [PATCH 034/189] Add support for Hygon Dhyana --- cpuid.h | 5 ++++ cpuid_x86.c | 54 +++++++++++++++++++++++++++++++++++++---- driver/others/dynamic.c | 11 ++++++++- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/cpuid.h b/cpuid.h index c56672ad8b..697f43133e 100644 --- a/cpuid.h +++ b/cpuid.h @@ -53,6 +53,7 @@ #define VENDOR_SIS 8 #define VENDOR_TRANSMETA 9 #define VENDOR_NSC 10 +#define VENDOR_HYGON 11 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -116,6 +117,7 @@ #define CORE_EXCAVATOR 26 #define CORE_ZEN 27 #define CORE_SKYLAKEX 28 +#define CORE_DHYANA 29 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -215,5 +217,8 @@ typedef struct { #define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_ZEN 51 #define CPUTYPE_SKYLAKEX 52 +#define CPUTYPE_DHYANA 53 + +#define CPUTYPE_HYGON_UNKNOWN 54 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 74cc6655b1..7260140330 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -271,6 +271,7 @@ int get_vendor(void){ if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; + if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -1046,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ } } - if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { + if ((get_vendor() == VENDOR_AMD) || + (get_vendor() == VENDOR_HYGON) || + (get_vendor() == VENDOR_CENTAUR)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); LDTB.size = 4096; @@ -1483,6 +1486,26 @@ int get_cpuname(void){ return CPUTYPE_AMD_UNKNOWN; } + if (vendor == VENDOR_HYGON){ + switch (family) { + case 0xf: + switch (exfamily) { + case 9: + //Hygon Dhyana + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_ZEN; +#else + return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CPUTYPE_BARCELONA; + } + break; + } + return CPUTYPE_HYGON_UNKNOWN; + } + if (vendor == VENDOR_CYRIX){ switch (family) { case 0x4: @@ -1604,7 +1627,8 @@ static char *cpuname[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", - "SKYLAKEX" + "SKYLAKEX", + "DHYANA" }; static char *lowercpuname[] = { @@ -1659,7 +1683,8 @@ static char *lowercpuname[] = { "steamroller", "excavator", "zen", - "skylakex" + "skylakex", + "dhyana" }; static char *corename[] = { @@ -1691,7 +1716,8 @@ static char *corename[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", - "SKYLAKEX" + "SKYLAKEX", + "DHYANA" }; static char *corename_lower[] = { @@ -1723,7 +1749,8 @@ static char *corename_lower[] = { 
"steamroller", "excavator", "zen", - "skylakex" + "skylakex", + "dhyana" }; @@ -2040,6 +2067,23 @@ int get_coretype(void){ } } + if (vendor == VENDOR_HYGON){ + if (family == 0xf){ + if (exfamily == 9) { + if(support_avx()) +#ifndef NO_AVX2 + return CORE_ZEN; +#else + return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CORE_BARCELONA; + } else { + return CORE_BARCELONA; + } + } + } + if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 9e59da2ccb..99c9254acb 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; #define VENDOR_INTEL 1 #define VENDOR_AMD 2 #define VENDOR_CENTAUR 3 +#define VENDOR_HYGON 4 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -369,6 +370,7 @@ static int get_vendor(void){ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -604,7 +606,7 @@ static gotoblas_t *get_coretype(void){ } } - if (vendor == VENDOR_AMD){ + if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ if (family <= 0xe) { // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon cpuid(0x80000000, &eax, &ebx, &ecx, &edx); @@ -684,6 +686,13 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } + } else if (exfamily == 9) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
+ } }else { return &gotoblas_BARCELONA; } From def0385caaa054411676032bddafe1aee903f656 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 09:51:29 +0200 Subject: [PATCH 035/189] init From b70fd238366c6a822c7f1766ab125f64c67a6b39 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:18:54 +0200 Subject: [PATCH 036/189] disable NaN checks before BLAS calls dsolve.R --- benchmark/scripts/R/dsolve.R | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index a3fb78da71..6f1b8ef7ba 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,6 +2,10 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) { + options(matprod = "blas") +} + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +23,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,31 +30,23 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), ncol = n, nrow = n) - B <- matrix(rnorm(n * n), ncol = n, nrow = n) + A <- matrix(rnorm(n * n), nrow = n) + B <- matrix(rnorm(n * n), nrow = n) z <- system.time(for (l in 1:loops) { solve(A, B) }) - mflops <- - (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } + From 2777a7f506308550e37f7ef26ce05f53a0d096ef Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:23:51 +0200 Subject: [PATCH 037/189] disable NaN checks before BLAS calls dsolve.R (shorter config part) --- benchmark/scripts/R/dsolve.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index 6f1b8ef7ba..ad20459007 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,9 +2,7 @@ argv <- commandArgs(trailingOnly = TRUE) -if (!is.null(options("matprod")[[1]])) { - options(matprod = "blas") -} +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") nfrom <- 128 nto <- 2048 @@ -42,11 +40,10 @@ while (n <= nto) { solve(A, B) }) - mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e6) + mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep } - From 7af8b21dbbb523b0e9ab6caff271cb63affaa5f2 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:34:46 +0200 Subject: [PATCH 038/189] disable NaN checks before BLAS calls dsolve.R (shorter formula) --- benchmark/scripts/R/dsolve.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index ad20459007..46301570bc 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -40,7 +40,7 @@ while (n <= nto) { solve(A, B) }) - mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] 
* 1e+06) + mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) From 3afceb6c2a220ff61878c9a328846cc723de42ed Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:38:14 +0200 Subject: [PATCH 039/189] disable NaN checks before BLAS calls deig.R --- benchmark/scripts/R/deig.R | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index ece727fb37..32716471b8 100755 --- a/benchmark/scripts/R/deig.R +++ b/benchmark/scripts/R/deig.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,14 +28,7 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom @@ -45,11 +39,10 @@ while (n <= nto) { ev <- eigen(A) }) - mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } From 478d3c4569cd4957bbef779423ee7e51686b5c0a Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:41:46 +0200 Subject: [PATCH 040/189] disable NaN checks before BLAS calls deig.R (shorten matrix def) --- benchmark/scripts/R/deig.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index 32716471b8..c6d541dcf2 100755 --- a/benchmark/scripts/R/deig.R +++ b/benchmark/scripts/R/deig.R @@ -33,7 +33,7 @@ cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), ncol = n, nrow = n) + A <- matrix(rnorm(n * n), nrow = n) ev <- 0 z <- system.time(for (l in 1:loops) { ev <- eigen(A) From 3e601bd4195b24568eb4f7db2402ba3258fd82cc Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:54:22 +0200 Subject: [PATCH 041/189] disable NaN checks before BLAS calls dgemm.R --- benchmark/scripts/R/dgemm.R | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/benchmark/scripts/R/dgemm.R b/benchmark/scripts/R/dgemm.R index 75297dfb83..d7c3e81084 100755 --- a/benchmark/scripts/R/dgemm.R +++ b/benchmark/scripts/R/dgemm.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,26 +28,13 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(runif(n * n), - ncol = n, - nrow = n, - byrow = TRUE) - B <- matrix(runif(n * n), - ncol = n, - nrow = n, - byrow = TRUE) + A <- matrix(runif(n * n), nrow = n) + B 
<- matrix(runif(n * n), nrow = n) C <- 1 z <- system.time(for (l in 1:loops) { @@ -54,11 +42,10 @@ while (n <= nto) { l <- l + 1 }) - mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } From 8c3386be8780bdf631ffebe085fde2591d4cd062 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 16 Jan 2019 15:16:21 +0000 Subject: [PATCH 042/189] Added missing Blas1 single fp {saxpy, caxpy, cdot, crot(refactored version of srot),isamax ,isamin, icamax, icamin}, Fixed idamin,icamin choosing the first occurance index of equal minimals --- kernel/power/KERNEL.POWER8 | 20 +-- kernel/power/caxpy.c | 145 +++++++++++++++++++ kernel/power/cdot.c | 164 +++++++++++++++++++++ kernel/power/crot.c | 213 +++++++++++++++++++++++++++ kernel/power/icamax.c | 261 +++++++++++++++++++++++++++++++++ kernel/power/icamin.c | 266 ++++++++++++++++++++++++++++++++++ kernel/power/idamin.c | 50 +++---- kernel/power/isamax.c | 288 +++++++++++++++++++++++++++++++++++++ kernel/power/isamin.c | 288 +++++++++++++++++++++++++++++++++++++ kernel/power/izamin.c | 26 ++-- kernel/power/saxpy.c | 129 +++++++++++++++++ 11 files changed, 1802 insertions(+), 48 deletions(-) create mode 100644 kernel/power/caxpy.c create mode 100644 kernel/power/cdot.c create mode 100644 kernel/power/crot.c create mode 100644 kernel/power/icamax.c create mode 100644 kernel/power/icamin.c create mode 100644 kernel/power/isamax.c create mode 100644 kernel/power/isamin.c create mode 100644 kernel/power/saxpy.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 00ff8682a5..cbcffb8fe8 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # -#ISAMAXKERNEL = ../arm/iamax.c +ISAMAXKERNEL = isamax.c IDAMAXKERNEL = idamax.c -#ICAMAXKERNEL = ../arm/izamax.c -IZAMAXKERNEL = izamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c # -#ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = idamin.c -#ICAMINKERNEL = ../arm/izamin.c +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # -#SAXPYKERNEL = ../arm/axpy.c +SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -#CAXPYKERNEL = ../arm/zaxpy.c +CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c @@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -#CDOTKERNEL = ../arm/zdot.c +CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = ../arm/zrot.c +CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c diff --git a/kernel/power/caxpy.c b/kernel/power/caxpy.c new file mode 100644 index 0000000000..4bdf13c34e --- /dev/null +++ b/kernel/power/caxpy.c @@ -0,0 +1,145 @@ +/* +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#ifndef HAVE_ASM_KERNEL +#include <altivec.h> +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) +{ + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector float valpha_r = {alpha_r, alpha_r,alpha_r, alpha_r}; + register __vector float valpha_i = {-alpha_i, alpha_i,-alpha_i, alpha_i}; + +#else + register __vector float valpha_r = {alpha_r, -alpha_r,alpha_r, -alpha_r}; + register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; +#endif + + __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + register __vector float *vy = (__vector float *) y; + register __vector float *vx = (__vector float *) x; + BLASLONG i=0; + for (; i < n/2; i += 8) { + + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float vy_2 = vy[i + 2]; + register __vector float vy_3 = vy[i + 3]; + register __vector float vy_4 = vy[i + 4]; + register __vector float vy_5 = vy[i + 5]; + register __vector float vy_6 = vy[i + 6]; + register __vector float vy_7 = vy[i + 7]; + register __vector float vx_0 = vx[i]; + register __vector float vx_1 = vx[i + 1]; + register __vector float vx_2 = vx[i + 2]; + register __vector float vx_3 = vx[i + 3]; + register __vector float vx_4 = vx[i + 4]; + register __vector float vx_5 = vx[i + 5]; + register __vector float vx_6 = vx[i + 6]; + register __vector float vx_7 = vx[i + 7]; + vy_0 += vx_0*valpha_r; + vy_1 += vx_1*valpha_r; + vy_2 += vx_2*valpha_r; + vy_3 += vx_3*valpha_r; + vy_4 += vx_4*valpha_r; + vy_5 += vx_5*valpha_r; + vy_6 += vx_6*valpha_r; + vy_7 += vx_7*valpha_r; + vx_0 = vec_perm(vx_0, vx_0, swap_mask); + vx_1 = vec_perm(vx_1, vx_1, swap_mask); + vx_2 = vec_perm(vx_2, vx_2, swap_mask); + vx_3 = vec_perm(vx_3, vx_3, swap_mask); + vx_4 = vec_perm(vx_4, vx_4, swap_mask); + vx_5 = vec_perm(vx_5, vx_5, swap_mask); + vx_6 = vec_perm(vx_6, vx_6, swap_mask); + vx_7 = vec_perm(vx_7, vx_7, swap_mask); + vy_0 += vx_0*valpha_i; + vy_1 += vx_1*valpha_i; + vy_2 += vx_2*valpha_i; + vy_3 += vx_3*valpha_i; + vy_4 += vx_4*valpha_i; + vy_5 += vx_5*valpha_i; + vy_6 += vx_6*valpha_i; + vy_7 += vx_7*valpha_i; + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; +
vy[i + 4] = vy_4; + vy[i + 5] = vy_5; + vy[i + 6] = vy_6; + vy[i + 7] = vy_7; + + } +} +#endif +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + if (n <= 0) return (0); + if ((inc_x == 1) && (inc_y == 1)) { + BLASLONG n1 = n & -16; + if (n1) { + caxpy_kernel_16(n1, x, y, da_r,da_i); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + } + return (0); + + } + inc_x *= 2; + inc_y *= 2; + while (i < n) { +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} + diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c new file mode 100644 index 0000000000..f86a33f228 --- /dev/null +++ b/kernel/power/cdot.c @@ -0,0 +1,164 @@ +/*Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
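The caxpy kernel above realizes the complex update y += alpha*x with two real vector multiply-adds: one by alpha_r broadcast across lanes, one by alpha_i with alternating signs applied to a lane-swapped copy of x. A scalar model of the same decomposition for one element, non-conjugated case (an illustrative helper, not part of the patch):

/* (yr + i*yi) += (ar + i*ai)*(xr + i*xi)
   => yr += ar*xr - ai*xi,  yi += ar*xi + ai*xr
   swap_mask turns (xr, xi) into (xi, xr); valpha_i carries (-ai, ai). */
static void caxpy_one(float y[2], const float x[2], float ar, float ai) {
    y[0] += ar * x[0] + (-ai) * x[1];
    y[1] += ar * x[1] + ( ai) * x[0];
}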
+ *****************************************************************************/ + +#include "common.h" + +#ifndef HAVE_KERNEL_8 +#include +static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) +{ + __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + register __vector float *vy = (__vector float *) y; + register __vector float *vx = (__vector float *) x; + BLASLONG i = 0; + register __vector float vd_0 = { 0 }; + register __vector float vd_1 = { 0 }; + register __vector float vd_2 = { 0 }; + register __vector float vd_3 = { 0 }; + register __vector float vdd_0 = { 0 }; + register __vector float vdd_1 = { 0 }; + register __vector float vdd_2 = { 0 }; + register __vector float vdd_3 = { 0 }; + for (; i < n/2; i += 4) { + + register __vector float vyy_0 ; + register __vector float vyy_1 ; + register __vector float vyy_2 ; + register __vector float vyy_3 ; + + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float vy_2 = vy[i + 2]; + register __vector float vy_3 = vy[i + 3]; + register __vector float vx_0= vx[i]; + register __vector float vx_1 = vx[i + 1]; + register __vector float vx_2 = vx[i + 2]; + register __vector float vx_3 = vx[i + 3]; + vyy_0 = vec_perm(vy_0, vy_0, swap_mask); + vyy_1 = vec_perm(vy_1, vy_1, swap_mask); + vyy_2 = vec_perm(vy_2, vy_2, swap_mask); + vyy_3 = vec_perm(vy_3, vy_3, swap_mask); + + vd_0 += vx_0 * vy_0; + vd_1 += vx_1 * vy_1; + vd_2 += vx_2 * vy_2; + vd_3 += vx_3 * vy_3; + + vdd_0 += vx_0 * vyy_0; + vdd_1 += vx_1 * vyy_1; + vdd_2 += vx_2 * vyy_2; + vdd_3 += vx_3 * vyy_3; + + + } + //aggregate + vd_0 = vd_0 + vd_1 +vd_2 +vd_3; + vdd_0= vdd_0 + vdd_1 +vdd_2 +vdd_3; + //reverse and aggregate + vd_1=vec_xxpermdi(vd_0,vd_0,2) ; + vdd_1=vec_xxpermdi(vdd_0,vdd_0,2); + vd_2=vd_0+vd_1; + vdd_2=vdd_0+vdd_1; + + dot[0]=vd_2[0]; + dot[1]=vd_2[1]; + dot[2]=vdd_2[0]; + dot[3]=vdd_2[1]; + +} +#endif + + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix=0, iy=0; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -8; + BLASLONG j=0; + + if (n1){ + cdot_kernel_8(n1, x, y, dot); + i = n1; + j = n1 <<1; + } + + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if !defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} diff --git a/kernel/power/crot.c b/kernel/power/crot.c new file mode 100644 index 0000000000..7e04a09e80 --- /dev/null +++ b/kernel/power/crot.c @@ -0,0 +1,213 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. 
%[temp_n], %[temp_n], -16 \n\t" + "ble 2f \n\t" + ".p2align 5 \n\t" + "1: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -16 \n\t" + "bgt 1b \n\t" + "2: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT temp; + if ( n <= 0 ) return(0); + if ( (inc_x == 1) && (inc_y == 1) ) + { + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x1, y1, c, s); + i=n1; + } + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + i++ ; + } + + } + else + { + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + return(0); +} + diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c new file mode 100644 index 0000000000..aa0531dc61 --- /dev/null +++ b/kernel/power/icamax.c @@ -0,0 +1,261 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else 
if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c new file mode 100644 index 0000000000..36432c9933 --- /dev/null +++ b/kernel/power/icamin.c @@ -0,0 +1,266 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
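The scalar epilogue of these icamax/icamin kernels reduces four (value, index) lane candidates while preferring the lowest index whenever two lanes tie on the extreme value, which is why equality gets its own branch in the pairwise comparisons above. A compact sketch of the same rule (hypothetical helper, max case):

/* Reduce four (value, index) candidates to one, preferring the lower
   index on ties; mirrors the unrolled a1/a2/a3/a4 comparisons above. */
static unsigned int reduce4_max(const float v[4], const unsigned int idx[4],
                                float *maxf) {
    float best = v[0];
    unsigned int best_i = idx[0];
    int k;
    for (k = 1; k < 4; k++) {
        if (v[k] > best) {                       /* strictly larger value wins  */
            best = v[k];
            best_i = idx[k];
        } else if (v[k] == best && idx[k] < best_i) {
            best_i = idx[k];                     /* tie: keep the smaller index */
        }
    }
    *maxf = best;
    return best_i;
}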
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index f4d1d1bdb4..7fe0f8a330 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -89,10 +89,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { ".p2align 5 \n\t" "1: \n\t" - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + 
"xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -103,8 +103,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -125,7 +125,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -139,7 +139,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -162,10 +162,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //<-----------jump here from first load "2: \n\t" - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -176,8 +176,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" "xxsel 0 ,0,1,2 \n\t" "xxsel 34,34,35,3 \n\t" @@ -194,7 +194,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -210,7 +210,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -238,10 +238,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //============================================================================== - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -252,8 +252,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" @@ -264,14 +264,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" "vaddudm 1,1,5 \n\t" // get real index for first smaller //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" 
"xxsel 38,38,33,2 \n\t" @@ -284,7 +284,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c new file mode 100644 index 0000000000..bf1af78d6d --- /dev/null +++ b/kernel/power/isamax.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c new file mode 100644 index 0000000000..1c1f0ad788 --- /dev/null +++ b/kernel/power/isamin.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 448247ffd5..1ffa3ba8b2 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -101,8 +101,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -114,7 +114,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -126,7 +126,7 @@ static BLASLONG 
ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -166,8 +166,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -179,7 +179,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -191,7 +191,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -235,15 +235,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "xxsel 32,40,41,50 \n\t" "xxsel 0,46,47,50 \n\t" "xxsel 33,42,43,51 \n\t" "xxsel 1,48,49,51 \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "xxsel 32,32,33,2 \n\t" "xxsel 3,0,1,2 \n\t" @@ -252,7 +252,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -267,7 +267,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c new file mode 100644 index 0000000000..393cdfadc4 --- /dev/null +++ b/kernel/power/saxpy.c @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
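Throughout these kernels, n & -64 (and its -16, -8, -4 relatives) rounds n down to the nearest multiple of the vector kernel's block size, since -64 in two's complement is all ones except the low six bits; the scalar loop then finishes the remainder, as in the saxpy kernel below. A small standalone check:

#include <stdio.h>

int main(void) {
    long n = 203;
    long n1 = n & -64;   /* 192: largest multiple of 64 <= n, vector part */
    long tail = n - n1;  /* 11: left for the scalar cleanup loop          */
    printf("%ld vector + %ld scalar elements\n", n1, tail);
    return 0;
}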
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + + + +#ifndef HAVE_KERNEL_8 +#include + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG i = 0; + __vector float v_a = {alpha,alpha,alpha,alpha}; + __vector float * v_y=(__vector float *)y; + __vector float * v_x=(__vector float *)x; + + for(; i Date: Thu, 17 Jan 2019 14:45:31 +0000 Subject: [PATCH 043/189] crot fix --- kernel/power/crot.c | 90 +++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 36 deletions(-) diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 7e04a09e80..40e350ba3f 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -55,7 +55,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "lxvd2x 51, %[i48], %[y_ptr] \n\t" "addi %[x_ptr], %[x_ptr], 64 \n\t" "addi %[y_ptr], %[y_ptr], 64 \n\t" - "addic. %[temp_n], %[temp_n], -16 \n\t" + "addic. %[temp_n], %[temp_n], -8 \n\t" "ble 2f \n\t" ".p2align 5 \n\t" "1: \n\t" @@ -103,7 +103,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" "addi %[x_ptr], %[x_ptr], 128 \n\t" "addi %[y_ptr], %[y_ptr], 128 \n\t" - "addic. %[temp_n], %[temp_n], -16 \n\t" + "addic. 
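/* The fix above and below: crot_kernel_8 is handed n1 = n & -8 *complex*
 * elements, so the counter counts complex pairs, of which each pass
 * consumes eight. The old "-16" decrement ran the counter out after half
 * the data, leaving the tail of the vectors unrotated; "-8" matches what
 * a pass actually processes. The scalar paths in CNAME below get the
 * matching fix: they now step two floats at a time and rotate the real
 * and imaginary components of each element. */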
%[temp_n], %[temp_n], -8 \n\t" "bgt 1b \n\t" "2: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x @@ -173,41 +173,59 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT *x1=x; - FLOAT *y1=y; - FLOAT temp; - if ( n <= 0 ) return(0); - if ( (inc_x == 1) && (inc_y == 1) ) - { - BLASLONG n1 = n & -8; - if ( n1 > 0 ) - { - crot_kernel_8(n1, x1, y1, c, s); - i=n1; - } - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - i++ ; - } + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; - } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; - ix += inc_x ; - iy += inc_y ; - i++ ; - } - } + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x, y, c, s); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + } return(0); } From 3e9fd6359dabb1c9c8ce3fa5e980e94a3536d2c0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Jan 2019 16:19:03 +0100 Subject: [PATCH 044/189] Bump xcode version to 10.1 to make sure it handles AVX512 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 51679af620..ec5dc8a9bf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,7 +149,7 @@ matrix: - &test-macos os: osx - osx_image: xcode8.3 + osx_image: xcode10.1 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update From d5e6940253b2ee638509de283b8b1d7695fefbbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Jan 2019 23:20:32 +0100 Subject: [PATCH 045/189] Fix declaration of input arguments in the x86_64 microkernels for DOT and AXPY (#1965) * Tag operands 0 and 1 as both input and output For #1964 (basically a continuation of coding problems first seen in #1292) --- kernel/x86_64/caxpy_microk_bulldozer-2.c | 14 +++++++------- kernel/x86_64/caxpy_microk_haswell-2.c | 6 +++--- kernel/x86_64/caxpy_microk_sandy-2.c | 8 ++++---- kernel/x86_64/caxpy_microk_steamroller-2.c | 14 +++++++------- kernel/x86_64/cdot_microk_bulldozer-2.c | 14 +++++++------- kernel/x86_64/cdot_microk_haswell-2.c | 6 +++--- kernel/x86_64/cdot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/cdot_microk_steamroller-2.c | 14 +++++++------- kernel/x86_64/daxpy_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/daxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/daxpy_microk_nehalem-2.c | 6 +++--- kernel/x86_64/daxpy_microk_piledriver-2.c | 16 ++++++++-------- kernel/x86_64/daxpy_microk_sandy-2.c | 8 ++++---- kernel/x86_64/daxpy_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/ddot_microk_bulldozer-2.c | 8 ++++---- kernel/x86_64/ddot_microk_haswell-2.c | 6 +++--- kernel/x86_64/ddot_microk_nehalem-2.c | 8 ++++---- kernel/x86_64/ddot_microk_piledriver-2.c | 16 ++++++++-------- 
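Background for this batch of one-line constraint changes: every one of these
kernels advances the index register and decrements the count register inside
the asm body, yet both were declared as plain inputs ("r"). That lets GCC
assume the registers still hold their original values after the asm, which
newer compilers exploit and thereby miscompile the C tail loops that follow.
Declaring the operands "+r" (read-write) states the side effect explicitly.
A minimal standalone illustration (not one of the kernels in this patch):

static long count_down(long n)
{
    __asm__ volatile (
        "1:           \n\t"
        "subq $1, %0  \n\t"   /* modifies the register backing n */
        "jnz  1b      \n\t"
        : "+r" (n)            /* read-write: value changes inside the asm */
        :
        : "cc");
    return n;                 /* a plain "r"(n) input could return stale n */
}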
kernel/x86_64/ddot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/ddot_microk_steamroller-2.c | 8 ++++---- kernel/x86_64/saxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/saxpy_microk_nehalem-2.c | 6 +++--- kernel/x86_64/saxpy_microk_piledriver-2.c | 16 ++++++++-------- kernel/x86_64/saxpy_microk_sandy-2.c | 8 ++++---- kernel/x86_64/sdot_microk_bulldozer-2.c | 8 ++++---- kernel/x86_64/sdot_microk_haswell-2.c | 8 ++++---- kernel/x86_64/sdot_microk_nehalem-2.c | 8 ++++---- kernel/x86_64/sdot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/sdot_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_bulldozer-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/zaxpy_microk_sandy-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_bulldozer-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_haswell-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_sandy-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_steamroller-2.c | 16 ++++++++-------- 37 files changed, 202 insertions(+), 202 deletions(-) diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index 33bda09434..ca2209340c 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c index 00e2e6a42a..b605ea34c8 100644 --- a/kernel/x86_64/caxpy_microk_haswell-2.c +++ b/kernel/x86_64/caxpy_microk_haswell-2.c @@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c index a798fd9779..72d37afed6 100644 --- a/kernel/x86_64/caxpy_microk_sandy-2.c +++ b/kernel/x86_64/caxpy_microk_sandy-2.c @@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c index 87370b0320..7ca7af0701 100644 --- a/kernel/x86_64/caxpy_microk_steamroller-2.c +++ b/kernel/x86_64/caxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c 
b/kernel/x86_64/cdot_microk_bulldozer-2.c index f587aa0366..1186559130 100644 --- a/kernel/x86_64/cdot_microk_bulldozer-2.c +++ b/kernel/x86_64/cdot_microk_bulldozer-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c index fe195a63b2..8b9d6d104b 100644 --- a/kernel/x86_64/cdot_microk_haswell-2.c +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c index 01816917d2..fe142c38f7 100644 --- a/kernel/x86_64/cdot_microk_sandy-2.c +++ b/kernel/x86_64/cdot_microk_sandy-2.c @@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c index 76a3aa0eb0..7350b21c9f 100644 --- a/kernel/x86_64/cdot_microk_steamroller-2.c +++ b/kernel/x86_64/cdot_microk_steamroller-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c index 8c520dcf10..9c1305b977 100644 --- a/kernel/x86_64/daxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c @@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c index bbe8b95506..f3682e6d72 100644 --- a/kernel/x86_64/daxpy_microk_haswell-2.c +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c index 943d893af3..8feb9f26cd 100644 --- a/kernel/x86_64/daxpy_microk_nehalem-2.c +++ b/kernel/x86_64/daxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : 
- "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c index 95eb953b48..4b83124c7a 100644 --- a/kernel/x86_64/daxpy_microk_piledriver-2.c +++ b/kernel/x86_64/daxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c index 85e038cef1..db9a45de81 100644 --- a/kernel/x86_64/daxpy_microk_sandy-2.c +++ b/kernel/x86_64/daxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c index e40009037d..8e63fcc1db 100644 --- a/kernel/x86_64/daxpy_microk_steamroller-2.c +++ b/kernel/x86_64/daxpy_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c index 9756ee46a9..5590c5b177 100644 --- a/kernel/x86_64/ddot_microk_bulldozer-2.c +++ b/kernel/x86_64/ddot_microk_bulldozer-2.c @@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index 365737363b..dbb5487f70 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c index fb5ec9bca0..e5e234e225 100644 --- a/kernel/x86_64/ddot_microk_nehalem-2.c +++ b/kernel/x86_64/ddot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movsd %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c index ac950885c6..cc4bcd90a2 100644 --- 
a/kernel/x86_64/ddot_microk_piledriver-2.c +++ b/kernel/x86_64/ddot_microk_piledriver-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c index 160f956048..84493ec273 100644 --- a/kernel/x86_64/ddot_microk_sandy-2.c +++ b/kernel/x86_64/ddot_microk_sandy-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c index 5ce20b5dee..27d5244ce2 100644 --- a/kernel/x86_64/ddot_microk_steamroller-2.c +++ b/kernel/x86_64/ddot_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 3a743d64c7..7099ba4c6f 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c index 68f68ea3a9..88bbb695d4 100644 --- a/kernel/x86_64/saxpy_microk_nehalem-2.c +++ b/kernel/x86_64/saxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c index 204cf8bacf..5feea7f241 100644 --- a/kernel/x86_64/saxpy_microk_piledriver-2.c +++ b/kernel/x86_64/saxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c index 0a6bef0466..0d448d5f88 100644 --- a/kernel/x86_64/saxpy_microk_sandy-2.c +++ b/kernel/x86_64/saxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : 
- : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c index 36e61b0776..8958a33dcc 100644 --- a/kernel/x86_64/sdot_microk_bulldozer-2.c +++ b/kernel/x86_64/sdot_microk_bulldozer-2.c @@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c index df367b61f1..91dc928d39 100644 --- a/kernel/x86_64/sdot_microk_haswell-2.c +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c index 1a27177f58..5a715d0083 100644 --- a/kernel/x86_64/sdot_microk_nehalem-2.c +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c index ca13536f26..ae25d5a50b 100644 --- a/kernel/x86_64/sdot_microk_sandy-2.c +++ b/kernel/x86_64/sdot_microk_sandy-2.c @@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c index 6b8b2566ba..bf6a5f2871 100644 --- a/kernel/x86_64/sdot_microk_steamroller-2.c +++ b/kernel/x86_64/sdot_microk_steamroller-2.c @@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 0e15761f79..15d3679717 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c index 
30e8b19552..89d23daf32 100644 --- a/kernel/x86_64/zaxpy_microk_haswell-2.c +++ b/kernel/x86_64/zaxpy_microk_haswell-2.c @@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c index 233af143ad..17b8b24f7c 100644 --- a/kernel/x86_64/zaxpy_microk_sandy-2.c +++ b/kernel/x86_64/zaxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c index 728d092133..907b1ae009 100644 --- a/kernel/x86_64/zaxpy_microk_steamroller-2.c +++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c index 30a9552d60..db9a48cce8 100644 --- a/kernel/x86_64/zdot_microk_bulldozer-2.c +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 11056a3c16..9f2fc2c1d9 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c index 87c5b03402..33415e26e5 100644 --- a/kernel/x86_64/zdot_microk_sandy-2.c +++ b/kernel/x86_64/zdot_microk_sandy-2.c @@ -107,10 
+107,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -199,10 +199,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c index 325f74ae30..87138fe9a0 100644 --- a/kernel/x86_64/zdot_microk_steamroller-2.c +++ b/kernel/x86_64/zdot_microk_steamroller-2.c @@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 From b495e54310a99049c50c20425269f4b026b47dbb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Jan 2019 08:11:07 +0100 Subject: [PATCH 046/189] Fix declaration of input arguments in the x86_64 SCAL microkernels (#1966) * Tag arguments 0 and 1 as both input and output (see #1964) --- kernel/x86_64/cscal_microk_bulldozer-2.c | 32 +++++++++++----------- kernel/x86_64/cscal_microk_haswell-2.c | 30 ++++++++++---------- kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++----------- kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++---- kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++---- kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++---- kernel/x86_64/zscal_microk_bulldozer-2.c | 28 +++++++++---------- kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++----------- kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++----------- 9 files changed, 111 insertions(+), 111 deletions(-) diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c index 3abffc4cfa..31451aa6cb 100644 --- a/kernel/x86_64/cscal_microk_bulldozer-2.c +++ b/kernel/x86_64/cscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : 
"cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c index 0a4eb683c2..a04a4c4aba 100644 --- a/kernel/x86_64/cscal_microk_haswell-2.c +++ b/kernel/x86_64/cscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", // "0", "1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" - : - : - "r" (n), // 0 - "r" (x), // 1 + : + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c index 8346e17483..e8073d485e 100644 --- a/kernel/x86_64/cscal_microk_steamroller-2.c +++ b/kernel/x86_64/cscal_microk_steamroller-2.c @@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c index de53b0bc4b..096662781e 100644 --- a/kernel/x86_64/dscal_microk_bulldozer-2.c +++ b/kernel/x86_64/dscal_microk_bulldozer-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" 
(n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c index e732a27181..77ed59a4e3 100644 --- a/kernel/x86_64/dscal_microk_haswell-2.c +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c index 8d855072b4..9982b8e587 100644 --- a/kernel/x86_64/dscal_microk_sandy-2.c +++ b/kernel/x86_64/dscal_microk_sandy-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c index 03882d6b66..5e733ffdae 100644 --- a/kernel/x86_64/zscal_microk_bulldozer-2.c +++ b/kernel/x86_64/zscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c index d9253c1ed5..8c8f5b75cb 100644 --- a/kernel/x86_64/zscal_microk_haswell-2.c +++ b/kernel/x86_64/zscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", 
"%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c index 97b07add65..c9267ee0c3 100644 --- a/kernel/x86_64/zscal_microk_steamroller-2.c +++ b/kernel/x86_64/zscal_microk_steamroller-2.c @@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", From 32b0f1168ec5eb93e146245d732c5a2fa9d73282 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Jan 2019 08:11:39 +0100 Subject: [PATCH 047/189] Fix declaration of input arguments in the Sandybridge GER microkernels (#1967) * Tag arguments 0 and 1 as both input and output --- kernel/x86_64/dger_microk_sandy-2.c | 6 +++--- kernel/x86_64/sger_microk_sandy-2.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c index 2bf966a5f4..e8494500ff 100644 --- a/kernel/x86_64/dger_microk_sandy-2.c +++ b/kernel/x86_64/dger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" 
(y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c index 79180b991e..14f13475b8 100644 --- a/kernel/x86_64/sger_microk_sandy-2.c +++ b/kernel/x86_64/sger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 From cda81cfae0e3dc18b1c2e9d05d6e0f8e1bec3917 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Jan 2019 00:10:01 +0100 Subject: [PATCH 048/189] Shift transition to multithreading towards larger matrix sizes See #1886 and JuliaRobotics issue 500. trsm benchmarks on Haswell and Zen showed that with these values performance is roughly doubled for matrix sizes between 8x8 and 14x14, and still 10 to 20 percent better near the new cutoff at 32x32. --- interface/trsm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/interface/trsm.c b/interface/trsm.c index 5c2750e791..faec03ac23 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -81,6 +81,12 @@ #endif #endif +#ifndef COMPLEX +#define SMP_FACTOR 8 +#else +#define SMP_FACTOR 4 +#endif + static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef TRMM TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, @@ -366,10 +372,10 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) + if ( args.m < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else - if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) + if ( args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else args.nthreads = num_cpu_avail(3); From bbfdd6c0fe1e7d90099fe14f1e1f2fd775a47a36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Jan 2019 23:01:31 +0100 Subject: [PATCH 049/189] Increase Zen SWITCH_RATIO to 16 following GEMM benchmarks on Ryzen2700X. For #1464 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index fa6730208d..15ea663a8f 100644 --- a/param.h +++ b/param.h @@ -605,7 +605,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
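/* Worked numbers for the two threshold changes in this pair of patches:
 * with the default GEMM_MULTITHREAD_THRESHOLD of 4 (a build-time tunable,
 * assumed here), the old trsm cutoff of 2*4 = 8 spawned threads already at
 * 8x8, where thread-startup cost dominates; SMP_FACTOR = 8 (real) / 4
 * (complex) moves the cutoff to 32 / 16, matching the benchmarked sweet
 * spot near 32x32. The SWITCH_RATIO bump below serves the same goal for
 * GEMM on Zen: coarser per-thread granularity before work is split. */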
#define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 16 #ifdef ARCH_X86 From 83b5c6b92dc6f66becae1418beef60042eb92c6d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Jan 2019 12:18:53 +0100 Subject: [PATCH 050/189] Fix compilation with NO_AVX=1 set fixes #1974 --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 7260140330..c45ddd9680 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -228,7 +228,7 @@ int support_avx2(){ } int support_avx512(){ -#ifndef NO_AVX512 +#if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; From b111829226874550c524b36882ff84c90008f494 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 21 Jan 2019 15:56:04 +0200 Subject: [PATCH 051/189] [ZARCH] Update max/min functions --- kernel/zarch/camax.c | 162 +++++++++++++++++-------------------- kernel/zarch/camin.c | 180 +++++++++++++++++++----------------------- kernel/zarch/damax.c | 108 ++++++++----------------- kernel/zarch/damin.c | 110 ++++++++------------------ kernel/zarch/dmax.c | 89 ++++++++------------- kernel/zarch/dmin.c | 89 ++++++++------------- kernel/zarch/icamax.c | 33 ++++---- kernel/zarch/icamin.c | 31 ++++---- kernel/zarch/idamax.c | 51 ++++++------ kernel/zarch/idamin.c | 51 ++++++------ kernel/zarch/idmax.c | 51 ++++++------ kernel/zarch/idmin.c | 51 ++++++------ kernel/zarch/isamax.c | 55 +++++++------ kernel/zarch/isamin.c | 55 +++++++------ kernel/zarch/ismax.c | 55 +++++++------ kernel/zarch/ismin.c | 55 +++++++------ kernel/zarch/izamax.c | 27 ++++--- kernel/zarch/izamin.c | 27 ++++--- kernel/zarch/samax.c | 111 ++++++++------------------ kernel/zarch/samin.c | 111 ++++++++------------------ kernel/zarch/smax.c | 92 ++++++++------------- kernel/zarch/smin.c | 92 ++++++++------------- kernel/zarch/zamax.c | 118 +++++++++++++-------------- kernel/zarch/zamin.c | 118 +++++++++++++-------------- 24 files changed, 805 insertions(+), 1117 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 2c913b62e5..66d2508962 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -55,7 +55,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" - "pfd 1, 1024(%2) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" "vlef %%v16,0(%%r1,%2),0 \n\t" "vlef %%v17,4(%%r1,%2),0 \n\t" @@ -93,100 +93,88 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v22,120(%%r1,%2),3 \n\t" "vlef %%v23,124(%%r1,%2),3 \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchsb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchsb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vlef %%v16,128(%%r1,%2),0 \n\t" - "vlef %%v17,132(%%r1,%2),0 \n\t" - "vlef %%v16,136(%%r1,%2),1 \n\t" - "vlef %%v17,140(%%r1,%2),1 \n\t" - "vlef %%v16,144(%%r1,%2),2 \n\t" - "vlef %%v17,148(%%r1,%2),2 \n\t" - "vlef %%v16,152(%%r1,%2),3 \n\t" - "vlef %%v17,156(%%r1,%2),3 \n\t" - - "vlef %%v18,160(%%r1,%2),0 \n\t" - "vlef %%v19,164(%%r1,%2),0 \n\t" - "vlef %%v18,168(%%r1,%2),1 \n\t" - "vlef 
%%v19,172(%%r1,%2),1 \n\t" - "vlef %%v18,176(%%r1,%2),2 \n\t" - "vlef %%v19,180(%%r1,%2),2 \n\t" - "vlef %%v18,184(%%r1,%2),3 \n\t" - "vlef %%v19,188(%%r1,%2),3 \n\t" - - "vlef %%v20,192(%%r1,%2),0 \n\t" - "vlef %%v21,196(%%r1,%2),0 \n\t" - "vlef %%v20,200(%%r1,%2),1 \n\t" - "vlef %%v21,204(%%r1,%2),1 \n\t" - "vlef %%v20,208(%%r1,%2),2 \n\t" - "vlef %%v21,212(%%r1,%2),2 \n\t" - "vlef %%v20,216(%%r1,%2),3 \n\t" - "vlef %%v21,220(%%r1,%2),3 \n\t" - - "vlef %%v22,224(%%r1,%2),0 \n\t" - "vlef %%v23,228(%%r1,%2),0 \n\t" - "vlef %%v22,232(%%r1,%2),1 \n\t" - "vlef %%v23,236(%%r1,%2),1 \n\t" - "vlef %%v22,240(%%r1,%2),2 \n\t" - "vlef %%v23,244(%%r1,%2),2 \n\t" - "vlef %%v22,248(%%r1,%2),3 \n\t" - "vlef %%v23,252(%%r1,%2),3 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vlef %%v24,128(%%r1,%2),0 \n\t" + "vlef %%v25,132(%%r1,%2),0 \n\t" + "vlef %%v24,136(%%r1,%2),1 \n\t" + "vlef %%v25,140(%%r1,%2),1 \n\t" + "vlef %%v24,144(%%r1,%2),2 \n\t" + "vlef %%v25,148(%%r1,%2),2 \n\t" + "vlef %%v24,152(%%r1,%2),3 \n\t" + "vlef %%v25,156(%%r1,%2),3 \n\t" + + "vlef %%v26,160(%%r1,%2),0 \n\t" + "vlef %%v27,164(%%r1,%2),0 \n\t" + "vlef %%v26,168(%%r1,%2),1 \n\t" + "vlef %%v27,172(%%r1,%2),1 \n\t" + "vlef %%v26,176(%%r1,%2),2 \n\t" + "vlef %%v27,180(%%r1,%2),2 \n\t" + "vlef %%v26,184(%%r1,%2),3 \n\t" + "vlef %%v27,188(%%r1,%2),3 \n\t" + + "vlef %%v28,192(%%r1,%2),0 \n\t" + "vlef %%v29,196(%%r1,%2),0 \n\t" + "vlef %%v28,200(%%r1,%2),1 \n\t" + "vlef %%v29,204(%%r1,%2),1 \n\t" + "vlef %%v28,208(%%r1,%2),2 \n\t" + "vlef %%v29,212(%%r1,%2),2 \n\t" + "vlef %%v28,216(%%r1,%2),3 \n\t" + "vlef %%v29,220(%%r1,%2),3 \n\t" + + "vlef %%v30,224(%%r1,%2),0 \n\t" + "vlef %%v31,228(%%r1,%2),0 \n\t" + "vlef %%v30,232(%%r1,%2),1 \n\t" + "vlef %%v31,236(%%r1,%2),1 \n\t" + "vlef %%v30,240(%%r1,%2),2 \n\t" + "vlef %%v31,244(%%r1,%2),2 \n\t" + "vlef %%v30,248(%%r1,%2),3 \n\t" + "vlef %%v31,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16,%%v16 \n\t" + "vflpsb %%v17,%%v17 \n\t" + "vflpsb %%v18,%%v18 \n\t" + "vflpsb %%v19,%%v19 \n\t" + "vflpsb %%v20,%%v20 \n\t" + "vflpsb %%v21,%%v21 \n\t" + "vflpsb %%v22,%%v22 \n\t" + "vflpsb %%v23,%%v23 \n\t" + "vflpsb %%v24,%%v24 \n\t" + "vflpsb %%v25,%%v25 \n\t" + "vflpsb %%v26,%%v26 \n\t" + "vflpsb %%v27,%%v27 \n\t" + "vflpsb %%v28,%%v28 \n\t" + "vflpsb %%v29,%%v29 \n\t" + "vflpsb %%v30,%%v30 \n\t" + "vflpsb %%v31,%%v31 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" + "vfasb %%v18,%%v18,%%v19 \n\t" + "vfasb %%v20,%%v20,%%v21 \n\t" + "vfasb %%v22,%%v22,%%v23 \n\t" + "vfasb %%v24,%%v24,%%v25 \n\t" + "vfasb %%v26,%%v26,%%v27 \n\t" + "vfasb %%v28,%%v28,%%v29 \n\t" + "vfasb %%v30,%%v30,%%v31 \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" - "vfchsb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" - "vfchsb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 
\n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -233,11 +221,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { maxf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 733f98fbf9..5abc685b2e 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -43,8 +43,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) __asm__ volatile ( "vlef %%v0,0(%2),0 \n\t" "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),0 \n\t" - "vlef %%v16,12(%2),0 \n\t" + "vlef %%v0,8(%2),1 \n\t" + "vlef %%v16,12(%2),1 \n\t" "vlef %%v0,16(%2),2 \n\t" "vlef %%v16,20(%2),2 \n\t" "vlef %%v0,24(%2),3 \n\t" @@ -59,8 +59,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v16,0(%%r1,%2),0 \n\t" "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),0 \n\t" - "vlef %%v17,12(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),1 \n\t" + "vlef %%v17,12(%%r1,%2),1 \n\t" "vlef %%v16,16(%%r1,%2),2 \n\t" "vlef %%v17,20(%%r1,%2),2 \n\t" "vlef %%v16,24(%%r1,%2),3 \n\t" @@ -68,8 +68,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v18,32(%%r1,%2),0 \n\t" "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),0 \n\t" - "vlef %%v19,44(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),1 \n\t" + "vlef %%v19,44(%%r1,%2),1 \n\t" "vlef %%v18,48(%%r1,%2),2 \n\t" "vlef %%v19,52(%%r1,%2),2 \n\t" "vlef %%v18,56(%%r1,%2),3 \n\t" @@ -77,8 +77,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v20,64(%%r1,%2),0 \n\t" "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),0 \n\t" - "vlef %%v21,76(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),1 \n\t" + "vlef %%v21,76(%%r1,%2),1 \n\t" "vlef %%v20,80(%%r1,%2),2 \n\t" "vlef %%v21,84(%%r1,%2),2 \n\t" "vlef %%v20,88(%%r1,%2),3 \n\t" @@ -86,107 +86,95 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v22,96(%%r1,%2),0 \n\t" "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),0 \n\t" - "vlef %%v23,108(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),1 \n\t" + "vlef %%v23,108(%%r1,%2),1 \n\t" "vlef %%v22,112(%%r1,%2),2 \n\t" "vlef %%v23,116(%%r1,%2),2 \n\t" "vlef %%v22,120(%%r1,%2),3 \n\t" "vlef %%v23,124(%%r1,%2),3 \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchsb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchsb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vlef %%v16,128(%%r1,%2),0 \n\t" - "vlef %%v17,132(%%r1,%2),0 \n\t" - "vlef %%v16,136(%%r1,%2),0 \n\t" - "vlef %%v17,140(%%r1,%2),0 \n\t" - "vlef %%v16,144(%%r1,%2),2 \n\t" - "vlef %%v17,148(%%r1,%2),2 \n\t" - "vlef %%v16,152(%%r1,%2),3 \n\t" - "vlef %%v17,156(%%r1,%2),3 \n\t" - - "vlef %%v18,160(%%r1,%2),0 \n\t" - "vlef 
%%v19,164(%%r1,%2),0 \n\t" - "vlef %%v18,168(%%r1,%2),0 \n\t" - "vlef %%v19,172(%%r1,%2),0 \n\t" - "vlef %%v18,176(%%r1,%2),2 \n\t" - "vlef %%v19,180(%%r1,%2),2 \n\t" - "vlef %%v18,184(%%r1,%2),3 \n\t" - "vlef %%v19,188(%%r1,%2),3 \n\t" - - "vlef %%v20,192(%%r1,%2),0 \n\t" - "vlef %%v21,196(%%r1,%2),0 \n\t" - "vlef %%v20,200(%%r1,%2),0 \n\t" - "vlef %%v21,204(%%r1,%2),0 \n\t" - "vlef %%v20,208(%%r1,%2),2 \n\t" - "vlef %%v21,212(%%r1,%2),2 \n\t" - "vlef %%v20,216(%%r1,%2),3 \n\t" - "vlef %%v21,220(%%r1,%2),3 \n\t" - - "vlef %%v22,224(%%r1,%2),0 \n\t" - "vlef %%v23,228(%%r1,%2),0 \n\t" - "vlef %%v22,232(%%r1,%2),0 \n\t" - "vlef %%v23,236(%%r1,%2),0 \n\t" - "vlef %%v22,240(%%r1,%2),2 \n\t" - "vlef %%v23,244(%%r1,%2),2 \n\t" - "vlef %%v22,248(%%r1,%2),3 \n\t" - "vlef %%v23,252(%%r1,%2),3 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vlef %%v24,128(%%r1,%2),0 \n\t" + "vlef %%v25,132(%%r1,%2),0 \n\t" + "vlef %%v24,136(%%r1,%2),1 \n\t" + "vlef %%v25,140(%%r1,%2),1 \n\t" + "vlef %%v24,144(%%r1,%2),2 \n\t" + "vlef %%v25,148(%%r1,%2),2 \n\t" + "vlef %%v24,152(%%r1,%2),3 \n\t" + "vlef %%v25,156(%%r1,%2),3 \n\t" + + "vlef %%v26,160(%%r1,%2),0 \n\t" + "vlef %%v27,164(%%r1,%2),0 \n\t" + "vlef %%v26,168(%%r1,%2),1 \n\t" + "vlef %%v27,172(%%r1,%2),1 \n\t" + "vlef %%v26,176(%%r1,%2),2 \n\t" + "vlef %%v27,180(%%r1,%2),2 \n\t" + "vlef %%v26,184(%%r1,%2),3 \n\t" + "vlef %%v27,188(%%r1,%2),3 \n\t" + + "vlef %%v28,192(%%r1,%2),0 \n\t" + "vlef %%v29,196(%%r1,%2),0 \n\t" + "vlef %%v28,200(%%r1,%2),1 \n\t" + "vlef %%v29,204(%%r1,%2),1 \n\t" + "vlef %%v28,208(%%r1,%2),2 \n\t" + "vlef %%v29,212(%%r1,%2),2 \n\t" + "vlef %%v28,216(%%r1,%2),3 \n\t" + "vlef %%v29,220(%%r1,%2),3 \n\t" + + "vlef %%v30,224(%%r1,%2),0 \n\t" + "vlef %%v31,228(%%r1,%2),0 \n\t" + "vlef %%v30,232(%%r1,%2),1 \n\t" + "vlef %%v31,236(%%r1,%2),1 \n\t" + "vlef %%v30,240(%%r1,%2),2 \n\t" + "vlef %%v31,244(%%r1,%2),2 \n\t" + "vlef %%v30,248(%%r1,%2),3 \n\t" + "vlef %%v31,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16,%%v16 \n\t" + "vflpsb %%v17,%%v17 \n\t" + "vflpsb %%v18,%%v18 \n\t" + "vflpsb %%v19,%%v19 \n\t" + "vflpsb %%v20,%%v20 \n\t" + "vflpsb %%v21,%%v21 \n\t" + "vflpsb %%v22,%%v22 \n\t" + "vflpsb %%v23,%%v23 \n\t" + "vflpsb %%v24,%%v24 \n\t" + "vflpsb %%v25,%%v25 \n\t" + "vflpsb %%v26,%%v26 \n\t" + "vflpsb %%v27,%%v27 \n\t" + "vflpsb %%v28,%%v28 \n\t" + "vflpsb %%v29,%%v29 \n\t" + "vflpsb %%v30,%%v30 \n\t" + "vflpsb %%v31,%%v31 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" + "vfasb %%v18,%%v18,%%v19 \n\t" + "vfasb %%v20,%%v20,%%v21 \n\t" + "vfasb %%v22,%%v22,%%v23 \n\t" + "vfasb %%v24,%%v24,%%v25 \n\t" + "vfasb %%v26,%%v26,%%v27 \n\t" + "vfasb %%v28,%%v28,%%v29 \n\t" + "vfasb %%v30,%%v30,%%v31 \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfminsb %%v16,%%v16,%%v24,0 \n\t" + "vfminsb %%v18,%%v18,%%v26,0 \n\t" + "vfminsb %%v20,%%v20,%%v28,0 \n\t" + "vfminsb %%v22,%%v22,%%v30,0 \n\t" + + "vfminsb %%v16,%%v16,%%v20,0 \n\t" + "vfminsb %%v18,%%v18,%%v22,0 \n\t" - "vfchsb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfminsb %%v16,%%v16,%%v18,0 \n\t" - "vfchsb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" 
"agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfminsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -233,11 +221,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { minf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 236d11c722..a3d63fe532 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -39,8 +39,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) FLOAT amax; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -54,79 +53,42 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v25,8 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,8 \n\t" + "vfmaxdb %%v19,%%v19,%%v27,8 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,8 \n\t" + "vfmaxdb %%v21,%%v21,%%v29,8 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,8 \n\t" 
+ "vfmaxdb %%v23,%%v23,%%v31,8 \n\t" + + "vfmaxdb %%v16,%%v16,%%v20,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v21,8 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,8 \n\t" + "vfmaxdb %%v19,%%v19,%%v23,8 \n\t" + + "vfmaxdb %%v16,%%v16,%%v18,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v19,8 \n\t" + + "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" + + "vfmaxdb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ldr %0,%%f0 " + "vrepg %%v16,%%v0,1 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,8 \n\t" + "lpdr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -168,11 +130,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index c2c63c6c5a..738ed8710e 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -39,11 +39,10 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) FLOAT amin; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%2) \n\t" @@ -54,79 +53,42 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - 
"vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v16,%%v16,%%v24,8 \n\t" + "vfmindb %%v17,%%v17,%%v25,8 \n\t" + "vfmindb %%v18,%%v18,%%v26,8 \n\t" + "vfmindb %%v19,%%v19,%%v27,8 \n\t" + "vfmindb %%v20,%%v20,%%v28,8 \n\t" + "vfmindb %%v21,%%v21,%%v29,8 \n\t" + "vfmindb %%v22,%%v22,%%v30,8 \n\t" + "vfmindb %%v23,%%v23,%%v31,8 \n\t" + + "vfmindb %%v16,%%v16,%%v20,8 \n\t" + "vfmindb %%v17,%%v17,%%v21,8 \n\t" + "vfmindb %%v18,%%v18,%%v22,8 \n\t" + "vfmindb %%v19,%%v19,%%v23,8 \n\t" + + "vfmindb %%v16,%%v16,%%v18,8 \n\t" + "vfmindb %%v17,%%v17,%%v19,8 \n\t" + + "vfmindb %%v16,%%v16,%%v17,8 \n\t" + + "vfmindb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ldr %0,%%f0 " + "vrepg %%v16,%%v0,1 \n\t" + "wfmindb %%v0,%%v0,%%v16,8 \n\t" + "lpdr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -168,11 +130,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index 469f657358..aa8b932f9f 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -32,7 +32,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) FLOAT max; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -46,62 +46,41 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb 
%%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v25,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxdb %%v19,%%v19,%%v27,0 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxdb %%v21,%%v21,%%v29,0 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" + "vfmaxdb %%v23,%%v23,%%v31,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v21,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" + "vfmaxdb %%v19,%%v19,%%v23,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v19,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v17,0 \n\t" + + "vfmaxdb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepg %%v16,%%v0,1 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(max) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -144,11 +123,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 3df5049500..8ae5fe868c 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -32,7 +32,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) FLOAT min; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -46,62 +46,41 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v16,%%v16,%%v24,0 \n\t" + "vfmindb %%v17,%%v17,%%v25,0 \n\t" + "vfmindb %%v18,%%v18,%%v26,0 \n\t" + "vfmindb %%v19,%%v19,%%v27,0 \n\t" + "vfmindb %%v20,%%v20,%%v28,0 \n\t" + "vfmindb %%v21,%%v21,%%v29,0 \n\t" + "vfmindb 
%%v22,%%v22,%%v30,0 \n\t" + "vfmindb %%v23,%%v23,%%v31,0 \n\t" + + "vfmindb %%v16,%%v16,%%v20,0 \n\t" + "vfmindb %%v17,%%v17,%%v21,0 \n\t" + "vfmindb %%v18,%%v18,%%v22,0 \n\t" + "vfmindb %%v19,%%v19,%%v23,0 \n\t" + + "vfmindb %%v16,%%v16,%%v18,0 \n\t" + "vfmindb %%v17,%%v17,%%v19,0 \n\t" + + "vfmindb %%v16,%%v16,%%v17,0 \n\t" + + "vfmindb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepg %%v16,%%v0,1 \n\t" + "wfmindb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(min) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -144,11 +123,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 9b4077c6b0..27f969eee4 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -76,7 +76,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" - "pfd 1, 1024(%3) \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" "vlef %%v16,0(%%r1,%3),0 \n\t" "vlef %%v17,4(%%r1,%3),0 \n\t" @@ -127,14 +127,14 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -142,13 +142,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vlef %%v16,128(%%r1,%3),0 \n\t" @@ -200,14 +200,14 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -215,13 +215,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -250,8 +250,8 @@ 
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -302,6 +302,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 6e952a3256..ae7b37b4fc 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -127,14 +127,14 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -142,13 +142,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vlef %%v16,128(%%r1,%3),0 \n\t" @@ -200,14 +200,14 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -215,13 +215,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -250,8 +250,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -302,6 +302,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 4f7ff69857..e5a1d3a7cc 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -63,7 +63,7 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 
\n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -83,10 +83,10 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -96,21 +96,21 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -130,10 +130,10 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -143,21 +143,21 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -175,8 +175,8 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -221,12 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 3abc7a5585..a68f7282f8 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -63,7 +63,7 @@ static 
BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -83,10 +83,10 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -96,21 +96,21 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -130,10 +130,10 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -143,21 +143,21 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -175,8 +175,8 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -221,12 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/idmax.c 
b/kernel/zarch/idmax.c index 313a88db44..4c3040779c 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -55,7 +55,7 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -67,10 +67,10 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -80,21 +80,21 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -106,10 +106,10 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -119,21 +119,21 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -151,8 +151,8 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imax),"=m"(*max) @@ -197,12 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + 
BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 42443215be..ba1776a49c 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -55,7 +55,7 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -67,10 +67,10 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -80,21 +80,21 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -106,10 +106,10 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -119,21 +119,21 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -151,8 +151,8 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imin),"=m"(*min) @@ -197,12 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { 
+ min = 0; minf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index dd2144db21..2f5c1c867a 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -81,7 +81,7 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -101,10 +101,10 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -114,14 +114,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -129,13 +129,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -155,10 +155,10 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -168,14 +168,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -183,13 +183,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 
\n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -218,8 +218,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -264,12 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index d7e44421d0..04e05aad96 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -81,7 +81,7 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -101,10 +101,10 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -114,14 +114,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -129,13 +129,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -155,10 +155,10 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -168,14 +168,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel 
%%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -183,13 +183,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -218,8 +218,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -264,12 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 1ebc6c8c8e..084b4ce94f 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -73,7 +73,7 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -85,10 +85,10 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -98,14 +98,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -113,13 +113,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" 
"vl %%v16,128(%%r1,%3) \n\t" @@ -131,10 +131,10 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -144,14 +144,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -159,13 +159,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -194,8 +194,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imax),"=m"(*max) @@ -240,12 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index a6b9d59de9..4e85816a39 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -73,7 +73,7 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -85,10 +85,10 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -98,14 +98,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb 
%%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -113,13 +113,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -131,10 +131,10 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -144,14 +144,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -159,13 +159,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -194,8 +194,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imin),"=m"(*min) @@ -240,12 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 541464b055..2ffad25703 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -93,21 +93,21 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 
\n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vleg %%v16,128(%%r1,%3),0 \n\t" @@ -139,21 +139,21 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -171,8 +171,8 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -223,6 +223,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 4b5572b80c..1e037c0c77 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -93,21 +93,21 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vleg %%v16,128(%%r1,%3),0 \n\t" @@ -139,21 +139,21 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -171,8 +171,8 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchdb %%v4,%%v0,%%v2 
\n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -223,6 +223,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index 61d50159fe..c8d831d063 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -40,8 +40,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -54,83 +53,45 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v25,8 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,8 \n\t" + "vfmaxsb %%v19,%%v19,%%v27,8 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,8 \n\t" + "vfmaxsb %%v21,%%v21,%%v29,8 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,8 \n\t" + "vfmaxsb %%v23,%%v23,%%v31,8 \n\t" + + "vfmaxsb %%v16,%%v16,%%v20,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v21,8 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,8 \n\t" + "vfmaxsb %%v19,%%v19,%%v23,8 \n\t" + + "vfmaxsb %%v16,%%v16,%%v18,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v19,8 \n\t" + + "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" 
+ + "vfmaxsb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ler %0,%%f0 " + "vrepf %%v16,%%v0,2 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,8 \n\t" + "lper %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -172,11 +133,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index a585a79ffb..dd24c74d75 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -40,8 +40,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -54,83 +53,45 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v16,%%v16,%%v24,8 \n\t" + "vfminsb 
%%v17,%%v17,%%v25,8 \n\t" + "vfminsb %%v18,%%v18,%%v26,8 \n\t" + "vfminsb %%v19,%%v19,%%v27,8 \n\t" + "vfminsb %%v20,%%v20,%%v28,8 \n\t" + "vfminsb %%v21,%%v21,%%v29,8 \n\t" + "vfminsb %%v22,%%v22,%%v30,8 \n\t" + "vfminsb %%v23,%%v23,%%v31,8 \n\t" + + "vfminsb %%v16,%%v16,%%v20,8 \n\t" + "vfminsb %%v17,%%v17,%%v21,8 \n\t" + "vfminsb %%v18,%%v18,%%v22,8 \n\t" + "vfminsb %%v19,%%v19,%%v23,8 \n\t" + + "vfminsb %%v16,%%v16,%%v18,8 \n\t" + "vfminsb %%v17,%%v17,%%v19,8 \n\t" + + "vfminsb %%v16,%%v16,%%v17,8 \n\t" + + "vfminsb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,8 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ler %0,%%f0 " + "vrepf %%v16,%%v0,2 \n\t" + "wfminsb %%v0,%%v0,%%v16,8 \n\t" + "lper %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -172,11 +133,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index bcdb473afa..8a2b86dc17 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -33,7 +33,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -46,66 +46,44 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,0 
\n\t" + "vfmaxsb %%v17,%%v17,%%v25,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxsb %%v19,%%v19,%%v27,0 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxsb %%v21,%%v21,%%v29,0 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" + "vfmaxsb %%v23,%%v23,%%v31,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v21,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" + "vfmaxsb %%v19,%%v19,%%v23,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v19,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v17,0 \n\t" + + "vfmaxsb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepf %%v16,%%v0,2 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(max) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -148,11 +126,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index 91c31d284d..b87ec0fe81 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -33,7 +33,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -46,66 +46,44 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v16,%%v16,%%v24,0 \n\t" + "vfminsb %%v17,%%v17,%%v25,0 \n\t" + "vfminsb %%v18,%%v18,%%v26,0 \n\t" + "vfminsb %%v19,%%v19,%%v27,0 \n\t" + "vfminsb 
%%v20,%%v20,%%v28,0 \n\t" + "vfminsb %%v21,%%v21,%%v29,0 \n\t" + "vfminsb %%v22,%%v22,%%v30,0 \n\t" + "vfminsb %%v23,%%v23,%%v31,0 \n\t" + + "vfminsb %%v16,%%v16,%%v20,0 \n\t" + "vfminsb %%v17,%%v17,%%v21,0 \n\t" + "vfminsb %%v18,%%v18,%%v22,0 \n\t" + "vfminsb %%v19,%%v19,%%v23,0 \n\t" + + "vfminsb %%v16,%%v16,%%v18,0 \n\t" + "vfminsb %%v17,%%v17,%%v19,0 \n\t" + + "vfminsb %%v16,%%v16,%%v17,0 \n\t" + + "vfminsb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepf %%v16,%%v0,2 \n\t" + "wfminsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(min) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -148,11 +126,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 8ef3f42ca9..8175874c05 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -69,76 +69,66 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) "vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vleg %%v24,128(%%r1,%2),0 \n\t" + "vleg %%v25,136(%%r1,%2),0 \n\t" + "vleg %%v24,144(%%r1,%2),1 \n\t" + "vleg %%v25,152(%%r1,%2),1 \n\t" + "vleg %%v26,160(%%r1,%2),0 \n\t" + "vleg %%v27,168(%%r1,%2),0 \n\t" + "vleg %%v26,176(%%r1,%2),1 \n\t" + "vleg %%v27,184(%%r1,%2),1 \n\t" + "vleg %%v28,192(%%r1,%2),0 \n\t" + "vleg %%v29,200(%%r1,%2),0 \n\t" + "vleg %%v28,208(%%r1,%2),1 \n\t" + "vleg %%v29,216(%%r1,%2),1 \n\t" + "vleg %%v30,224(%%r1,%2),0 \n\t" + "vleg %%v31,232(%%r1,%2),0 \n\t" + "vleg %%v30,240(%%r1,%2),1 \n\t" + "vleg %%v31,248(%%r1,%2),1 \n\t" + + "vflpdb %%v16,%%v16 \n\t" + "vflpdb %%v17,%%v17 \n\t" + "vflpdb %%v18,%%v18 \n\t" + "vflpdb %%v19,%%v19 \n\t" + "vflpdb %%v20,%%v20 \n\t" + 
"vflpdb %%v21,%%v21 \n\t" + "vflpdb %%v22,%%v22 \n\t" + "vflpdb %%v23,%%v23 \n\t" + "vflpdb %%v24,%%v24 \n\t" + "vflpdb %%v25,%%v25 \n\t" + "vflpdb %%v26,%%v26 \n\t" + "vflpdb %%v27,%%v27 \n\t" + "vflpdb %%v28,%%v28 \n\t" + "vflpdb %%v29,%%v29 \n\t" + "vflpdb %%v30,%%v30 \n\t" + "vflpdb %%v31,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" + "vfadb %%v18,%%v18,%%v19 \n\t" + "vfadb %%v20,%%v20,%%v21 \n\t" + "vfadb %%v22,%%v22,%%v23 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -185,11 +175,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { maxf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 30fd1d030a..5d57ff12e1 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -69,76 +69,66 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) "vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vleg 
%%v24,128(%%r1,%2),0 \n\t" + "vleg %%v25,136(%%r1,%2),0 \n\t" + "vleg %%v24,144(%%r1,%2),1 \n\t" + "vleg %%v25,152(%%r1,%2),1 \n\t" + "vleg %%v26,160(%%r1,%2),0 \n\t" + "vleg %%v27,168(%%r1,%2),0 \n\t" + "vleg %%v26,176(%%r1,%2),1 \n\t" + "vleg %%v27,184(%%r1,%2),1 \n\t" + "vleg %%v28,192(%%r1,%2),0 \n\t" + "vleg %%v29,200(%%r1,%2),0 \n\t" + "vleg %%v28,208(%%r1,%2),1 \n\t" + "vleg %%v29,216(%%r1,%2),1 \n\t" + "vleg %%v30,224(%%r1,%2),0 \n\t" + "vleg %%v31,232(%%r1,%2),0 \n\t" + "vleg %%v30,240(%%r1,%2),1 \n\t" + "vleg %%v31,248(%%r1,%2),1 \n\t" + + "vflpdb %%v16,%%v16 \n\t" + "vflpdb %%v17,%%v17 \n\t" + "vflpdb %%v18,%%v18 \n\t" + "vflpdb %%v19,%%v19 \n\t" + "vflpdb %%v20,%%v20 \n\t" + "vflpdb %%v21,%%v21 \n\t" + "vflpdb %%v22,%%v22 \n\t" + "vflpdb %%v23,%%v23 \n\t" + "vflpdb %%v24,%%v24 \n\t" + "vflpdb %%v25,%%v25 \n\t" + "vflpdb %%v26,%%v26 \n\t" + "vflpdb %%v27,%%v27 \n\t" + "vflpdb %%v28,%%v28 \n\t" + "vflpdb %%v29,%%v29 \n\t" + "vflpdb %%v30,%%v30 \n\t" + "vflpdb %%v31,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" + "vfadb %%v18,%%v18,%%v19 \n\t" + "vfadb %%v20,%%v20,%%v21 \n\t" + "vfadb %%v22,%%v22,%%v23 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmindb %%v16,%%v16,%%v24,0 \n\t" + "vfmindb %%v18,%%v18,%%v26,0 \n\t" + "vfmindb %%v20,%%v20,%%v28,0 \n\t" + "vfmindb %%v22,%%v22,%%v30,0 \n\t" + + "vfmindb %%v16,%%v16,%%v20,0 \n\t" + "vfmindb %%v18,%%v18,%%v22,0 \n\t" - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmindb %%v16,%%v16,%%v18,0 \n\t" - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmindb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmindb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -185,11 +175,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { minf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); From 63bbd7b0d79d41da2a7cc81139a62b81fa247640 Mon Sep 17 00:00:00 2001 From: Daniel Cohen Gindi Date: Mon, 21 Jan 2019 08:35:23 +0200 Subject: [PATCH 052/189] Better support for MSVC/Windows in CMake --- CMakeLists.txt | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 812e6bf6f2..8f3abe4b8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,19 @@ endif() ####### +if(MSVC AND MSVC_STATIC_CRT) + set(CompilerFlags + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + ) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() +endif() message(WARNING "CMake support is experimental. 
It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") @@ -149,12 +162,6 @@ if (${DYNAMIC_ARCH}) endforeach() endif () -# Only build shared libs for MSVC -if (MSVC) - set(BUILD_SHARED_LIBS ON) -endif() - - # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) @@ -314,7 +321,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NOFORTRAN) message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) + set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h) file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") @@ -327,10 +334,11 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") - install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) + file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT NO_LAPACKE) From f0d834b824fd5723c5cd8df01ed1aaa7a78548c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 12:32:24 +0100 Subject: [PATCH 053/189] Use VERSION_LESS for comparisons involving software version numbers --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f3abe4b8a..afd9d2cf20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,7 +147,7 @@ endif () # Only generate .def for dll on MSVC and always produce pdb files for debug and release if(MSVC) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") endif() set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") @@ -173,7 +173,7 @@ endif() # Handle MSVC exports if(MSVC AND BUILD_SHARED_LIBS) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") else() # Creates verbose .def file (51KB vs 18KB) From 24288803b3cde043bc4c10d82080509989680efb Mon Sep 17 00:00:00 2001 From: Daniel Cohen Gindi Date: Tue, 22 Jan 2019 14:38:01 +0200 Subject: [PATCH 054/189] Adjust test script for correct deployment --- appveyor.yml | 2 +- utest/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 141d3a130c..95f6cf7c59 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,7 +53,7 @@ before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT .. 
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 1b426afe7e..dc306501f2 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -61,7 +61,7 @@ foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${CMAKE_CURRENT_BINARY_DIR}) endforeach() -if (MSVC) +if (MSVC AND BUILD_SHARED_LIBS) add_custom_command(TARGET ${OpenBLAS_utest_bin} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. From 21eda8b5774aa92aecb9babba0b3eda0a992ddb9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 18:47:12 +0100 Subject: [PATCH 055/189] Report SkylakeX as Haswell if compiler does not support AVX512 ... or make was invoked with NO_AVX512=1 --- getarch.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/getarch.c b/getarch.c index 78ba0fefdb..d03ce6e981 100644 --- a/getarch.c +++ b/getarch.c @@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#else +#define NO_AVX512 +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_SKYLAKEX +#ifdef NO_AVX512 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#else #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif +#endif #ifdef FORCE_ATOM #define FORCE From b56b34a75cf3ae253cf8904416c6716406aad1fd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 18:55:43 +0100 Subject: [PATCH 056/189] Syntax fix --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 95f6cf7c59..741c662910 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,7 +53,7 @@ before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. 
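A short aside on the AVX-512 fallback introduced in PATCH 055 above: the guard is a pure compile-time probe of the toolchain. The standalone program below is a hypothetical sketch for illustration only; the preprocessor condition is copied verbatim from the getarch.c hunk, while main() and its messages are invented here.

#include <stdio.h>

/* Same guard as the PATCH 055 hunk: gcc 7 or later compiling with AVX2
 * enabled, or clang 6 or later, is taken as "can emit AVX-512 code";
 * anything else (or an explicit -DNO_AVX512 passed in by make) defines
 * NO_AVX512 and selects the HASWELL fallback configuration. */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#else
#define NO_AVX512
#endif

int main(void)
{
#ifdef NO_AVX512
	/* getarch would emit the HASWELL ARCHCONFIG here */
	puts("SKYLAKEX requested, falling back to HASWELL kernels");
#else
	puts("building SKYLAKEX kernels");
#endif
	return 0;
}

Note that this checks the compiler, not the build host's CPU, which matches getarch's role of probing the toolchain rather than the machine the library will eventually run on.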
From 8533aca96470d361cc5cc81da329190811951df1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 23 Jan 2019 10:03:00 +0100
Subject: [PATCH 057/189] Avoid penalizing tall skinny matrices

---
 interface/trsm.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/interface/trsm.c b/interface/trsm.c index faec03ac23..f2da285de2 100644
--- a/interface/trsm.c
+++ b/interface/trsm.c
@@ -82,9 +82,9 @@
 #endif

 #ifndef COMPLEX
-#define SMP_FACTOR 8
+#define SMP_FACTOR 256
 #else
-#define SMP_FACTOR 4
+#define SMP_FACTOR 128
 #endif

 static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
@@ -372,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order,
 mode |= (trans << BLAS_TRANSA_SHIFT);
 mode |= (side << BLAS_RSIDE_SHIFT);

- if ( args.m < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD )
+/*
+ if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD )
 args.nthreads = 1;
 else
- if ( args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD )
+ if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD )
 args.nthreads = 1;
+*/
+ if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD)
 args.nthreads = 1;
 else
 args.nthreads = num_cpu_avail(3);

From e908ac2a5145ac1a0d43e6baf39df14ade061d57 Mon Sep 17 00:00:00 2001
From: Edison Gustavo Muenz
Date: Wed, 23 Jan 2019 15:09:13 +0100
Subject: [PATCH 058/189] Fix include directory of exported targets

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt index 812e6bf6f2..d3a9a27971 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -157,7 +157,7 @@ endif()

 # add objects to the openblas lib
 add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
-target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $)
+target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $)

 # Android needs to explicitly link against libm
 if(ANDROID)

From e882b239aa75090c7871d5848a0ead7d37bafb6f Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 26 Jan 2019 00:45:45 +0100
Subject: [PATCH 059/189] Correct naming of getrf_parallel object fixes #1984

---
 lapack/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index c0a7543caa..d48a270ab7 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -63,7 +63,6 @@ if (USE_THREAD)

 # these do not have 'z' versions
 set(PARALLEL_SOURCES
- ${GETRF_SRC}
 lauum/lauum_U_parallel.c
 lauum/lauum_L_parallel.c
 potrf/potrf_U_parallel.c
@@ -81,6 +80,10 @@ if (USE_THREAD)
 trtri/trtri_L_parallel.c
 )

+ foreach (float_type ${FLOAT_TYPES})
+ GenerateNamedObjects("${GETRF_SRC}" "" "getrf_parallel" false "" "" false ${float_type})
+ endforeach()
+
 GenerateNamedObjects("${PARALLEL_SOURCES}")
 endif ()

From 36b844af889374934a4c5af19cf371cf29731d2e Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 26 Jan 2019 17:47:22 +0100
Subject: [PATCH 060/189] Change ARMV8 target to ARMV7 when BINARY32 is set fixes #1961

---
 Makefile.system | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile.system b/Makefile.system index 20d4f64920..67c8cd1972 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -95,6 +95,9 @@ endif
 ifeq ($(TARGET), ZEN)
 GETARCH_FLAGS := -DFORCE_BARCELONA
 endif
+ifeq ($(TARGET), ARMV8)
+GETARCH_FLAGS := -DFORCE_ARMV7
+endif
 endif
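One observation on the threshold rework in PATCH 057 above: the old rule dropped to a single thread whenever either matrix dimension was small, which penalized tall skinny inputs, while the new rule gates threading on the total work m * n. The helper below is a hypothetical sketch of that decision, not code from the patch; trsm_single_threaded is an invented name, and the numeric arguments merely stand in for SMP_FACTOR and GEMM_MULTITHREAD_THRESHOLD from interface/trsm.c and the build configuration.

#include <stdio.h>

/* Sketch of the PATCH 057 rule: single-thread an m x n TRSM only
 * when the total work m * n falls below the combined cutoff. */
static int trsm_single_threaded(long m, long n, long smp_factor, long threshold)
{
	return m * n < smp_factor * threshold;
}

int main(void)
{
	/* example values: smp_factor = 256 as in the real-valued case after
	 * PATCH 057, threshold = 4 as a stand-in for GEMM_MULTITHREAD_THRESHOLD */
	printf("10000x2: %s\n", trsm_single_threaded(10000, 2, 256, 4) ? "single" : "threaded");
	printf("16x16:   %s\n", trsm_single_threaded(16, 16, 256, 4) ? "single" : "threaded");
	return 0;
}

With these example values, a 10000x2 tall skinny solve clears the m * n cutoff and stays multithreaded, whereas the commented-out per-dimension rule would have forced it onto one thread simply because n is small.

From 58dd7e4501ad55ca03ae1da783de72cc36345f61 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 26 Jan 2019 17:52:33 +0100
Subject: [PATCH 061/189] Change ARMV8 target to ARMV7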
for BINARY=32 --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index a060d98cb0..4cee7bd18f 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") set(TARGET "BARCELONA") endif () + if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") + set(TARGET "ARMV7") + endif () endif () if (DEFINED TARGET) From 0f24b39ebf8945ddbe5d1516123e98b62853f5b4 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sun, 27 Jan 2019 15:33:00 +0100 Subject: [PATCH 062/189] Reword/expand comments in Makefile.rule Lots of small changes in the wording of the comments, plus an expansion of the NUM_THREADS and NO_AFFINITY sections. --- Makefile.rule | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 7c128fb498..1d5dcacaab 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -48,6 +48,8 @@ VERSION = 0.3.6.dev # HOSTCC = gcc # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 +# Please note that AVX is not available on 32-bit. +# Setting BINARY=32 disables AVX/AVX2/AVX-512. # BINARY=64 # About threaded BLAS. It will be automatically detected if you don't @@ -57,7 +59,7 @@ VERSION = 0.3.6.dev # USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. -# This flag is always set for POWER8. Don't modify the flag +# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. # USE_OPENMP = 1 # The OpenMP scheduler to use - by default this is "static" and you @@ -68,36 +70,39 @@ VERSION = 0.3.6.dev # allow you to select the scheduler from the environment variable OMP_SCHEDULE # CCOMMON_OPT += -DOMP_SCHED=dynamic -# You can define maximum number of threads. Basically it should be -# less than actual number of cores. If you don't specify one, it's +# You can define the maximum number of threads. Basically it should be less +# than or equal to the number of CPU threads. If you don't specify one, it's # automatically detected by the the script. +# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to +# restrict NUM_THREADS to the number of physical cores. By default, the automatic +# detection includes logical CPUs, thus allowing the use of SMT. # NUM_THREADS = 24 # If you have enabled USE_OPENMP and your application would call -# OpenBLAS's calculation API from multi threads, please comment it in. -# This flag defines how many instances of OpenBLAS's calculation API can -# actually run in parallel. If more threads call OpenBLAS's calculation API, +# OpenBLAS's calculation API from multiple threads, please comment this in. +# This flag defines how many instances of OpenBLAS's calculation API can actually +# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API, # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 -# if you don't need to install the static library, please comment it in. +# If you don't need to generate the static library, please comment this in. # NO_STATIC = 1 -# if you don't need generate the shared library, please comment it in. +# If you don't need to generate the shared library, please comment this in. 
# NO_SHARED = 1 -# If you don't need CBLAS interface, please comment it in. +# If you don't need the CBLAS interface, please comment this in. # NO_CBLAS = 1 -# If you only want CBLAS interface without installing Fortran compiler, -# please comment it in. +# If you only want the CBLAS interface without installing a Fortran compiler, +# please comment this in. # ONLY_CBLAS = 1 -# If you don't need LAPACK, please comment it in. -# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. +# If you don't need LAPACK, please comment this in. +# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1. # NO_LAPACK = 1 -# If you don't need LAPACKE (C Interface to LAPACK), please comment it in. +# If you don't need LAPACKE (C Interface to LAPACK), please comment this in. # NO_LAPACKE = 1 # Build LAPACK Deprecated functions since LAPACK 3.6.0 @@ -106,7 +111,7 @@ BUILD_LAPACK_DEPRECATED = 1 # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 -# If you want to use legacy threaded Level 3 implementation. +# If you want to use the legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 # If you want to use the new, still somewhat experimental code that uses @@ -116,8 +121,8 @@ BUILD_LAPACK_DEPRECATED = 1 # USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran -# compiler supports this. It's safe to keep comment it out if you -# are not sure(equivalent to "-i8" option). +# compilers support this. It's safe to keep this commented out if you +# are not sure. (This is equivalent to the "-i8" ifort option). # INTERFACE64 = 1 # Unfortunately most of kernel won't give us high quality buffer. @@ -125,10 +130,15 @@ BUILD_LAPACK_DEPRECATED = 1 # but it will consume time. If you don't like it, you can disable one. NO_WARMUP = 1 -# If you want to disable CPU/Memory affinity on Linux. +# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling. +# This feature is only implemented on Linux, and is always disabled on other platforms. +# Enabling affinity handling may improve performance, especially on NUMA systems, but +# it may conflict with certain applications that also try to manage affinity. +# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing +# else modifies affinity settings. NO_AFFINITY = 1 -# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus +# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # BIGNUMA = 1 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers From ea1716ce2aaa4edf09e837796026ecd6cae9116b Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sun, 27 Jan 2019 17:22:26 +0100 Subject: [PATCH 063/189] Update Makefile.rule Revert generate to install, explain the nature of the affinity conflict --- Makefile.rule | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 1d5dcacaab..faf34c0a11 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -85,7 +85,7 @@ VERSION = 0.3.6.dev # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 -# If you don't need to generate the static library, please comment this in. +# If you don't need to install the static library, please comment this in. # NO_STATIC = 1 # If you don't need to generate the shared library, please comment this in. 
@@ -134,6 +134,8 @@ NO_WARMUP = 1 # This feature is only implemented on Linux, and is always disabled on other platforms. # Enabling affinity handling may improve performance, especially on NUMA systems, but # it may conflict with certain applications that also try to manage affinity. +# This conflict can result in threads of the application calling OpenBLAS ending up locked +# to the same core(s) as OpenBLAS, possibly binding all threads to a single core. # For this reason, affinity handling is disabled by default. Can be safely enabled if nothing # else modifies affinity settings. NO_AFFINITY = 1 From c8ef9fb22064dc6cb1c7515ad8d7e25c7adf9a8a Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:16:18 +0200 Subject: [PATCH 064/189] [ZARCH] Fix bug in iamax/iamin/imax/imin --- kernel/zarch/icamax.c | 1 + kernel/zarch/icamin.c | 1 + kernel/zarch/idamax.c | 1 + kernel/zarch/idamin.c | 1 + kernel/zarch/idmax.c | 1 + kernel/zarch/idmin.c | 1 + kernel/zarch/isamax.c | 1 + kernel/zarch/isamin.c | 1 + kernel/zarch/ismax.c | 1 + kernel/zarch/ismin.c | 1 + kernel/zarch/izamax.c | 1 + kernel/zarch/izamin.c | 1 + 12 files changed, 12 insertions(+) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 27f969eee4..96cb37a1d7 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -283,6 +283,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index ae7b37b4fc..73bd9e8de9 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -283,6 +283,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index e5a1d3a7cc..4a0114242e 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -206,6 +206,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index a68f7282f8..503f92ff70 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -206,6 +206,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 4c3040779c..871c896e6a 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -182,6 +182,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ba1776a49c..dd14ec92cc 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -182,6 +182,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 2f5c1c867a..1a9ac3cd8f 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -249,6 +249,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 04e05aad96..5a7e669eb9 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -249,6 +249,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 084b4ce94f..0b144c2008 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -225,6 +225,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } 
else { + max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 4e85816a39..7fda9dffc3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -225,6 +225,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2ffad25703..7db64181c1 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -204,6 +204,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 1e037c0c77..707d702d37 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -204,6 +204,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); ix += 2; i++; From 04873bb174d45a9cac478d7db7fd6f2618df2e81 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:32:24 +0200 Subject: [PATCH 065/189] [ZARCH] Undo the last commit --- kernel/zarch/icamax.c | 1 - kernel/zarch/icamin.c | 1 - kernel/zarch/idamax.c | 1 - kernel/zarch/idamin.c | 1 - kernel/zarch/idmax.c | 1 - kernel/zarch/idmin.c | 1 - kernel/zarch/isamax.c | 1 - kernel/zarch/isamin.c | 1 - kernel/zarch/ismax.c | 1 - kernel/zarch/ismin.c | 1 - kernel/zarch/izamax.c | 1 - kernel/zarch/izamin.c | 1 - 12 files changed, 12 deletions(-) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 96cb37a1d7..27f969eee4 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -283,7 +283,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 73bd9e8de9..ae7b37b4fc 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -283,7 +283,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - min = 0; minf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 4a0114242e..e5a1d3a7cc 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -206,7 +206,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 503f92ff70..a68f7282f8 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -206,7 +206,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 871c896e6a..4c3040779c 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -182,7 +182,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index dd14ec92cc..ba1776a49c 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -182,7 +182,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 1a9ac3cd8f..2f5c1c867a 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -249,7 +249,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 5a7e669eb9..04e05aad96 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -249,7 +249,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = 
ABS(x[0]); i++; } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 0b144c2008..084b4ce94f 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -225,7 +225,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 7fda9dffc3..4e85816a39 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -225,7 +225,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 7db64181c1..2ffad25703 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -204,7 +204,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 707d702d37..1e037c0c77 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -204,7 +204,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - min = 0; minf = CABS1(x,0); ix += 2; i++; From c7143c1019d7a35f94454e2ac811cd948a41d22e Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:52:23 +0200 Subject: [PATCH 066/189] [ZARCH] Fix iamax/imax single precision --- kernel/zarch/icamax.c | 2 ++ kernel/zarch/icamin.c | 2 ++ kernel/zarch/isamax.c | 2 ++ kernel/zarch/isamin.c | 2 ++ kernel/zarch/ismax.c | 2 ++ kernel/zarch/ismin.c | 2 ++ 6 files changed, 12 insertions(+) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 27f969eee4..2d1442ad90 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -248,6 +248,8 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index ae7b37b4fc..79aa6d3410 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -248,6 +248,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 2f5c1c867a..6e0aaa162d 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -216,6 +216,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 04e05aad96..266c48f7ff 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -216,6 +216,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 084b4ce94f..c968ce6fa8 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -192,6 +192,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel 
%%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 4e85816a39..0145b31b31 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -192,6 +192,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" From dc4d3bccd5ee7de7bb823aa0bb7008a04bcc21d4 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 29 Jan 2019 03:47:49 +0200 Subject: [PATCH 067/189] [ZARCH] Fix icamax/icamin --- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 2d1442ad90..113c0cef5d 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -94,7 +94,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vlef %%v18,48(%%r1,%3),2 \n\t" "vlef %%v19,52(%%r1,%3),2 \n\t" "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,30(%%r1,%3),3 \n\t" + "vlef %%v19,60(%%r1,%3),3 \n\t" "vlef %%v20,64(%%r1,%3),0 \n\t" "vlef %%v21,68(%%r1,%3),0 \n\t" diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 79aa6d3410..5096b641b4 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -94,7 +94,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vlef %%v18,48(%%r1,%3),2 \n\t" "vlef %%v19,52(%%r1,%3),2 \n\t" "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,30(%%r1,%3),3 \n\t" + "vlef %%v19,60(%%r1,%3),3 \n\t" "vlef %%v20,64(%%r1,%3),0 \n\t" "vlef %%v21,68(%%r1,%3),0 \n\t" From fcd814a8d292b7712a4230d9b9a20f0f2ce0fe52 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 29 Jan 2019 17:59:38 +0200 Subject: [PATCH 068/189] [ZARCH] Fix bug in max/min functions --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- kernel/zarch/izamax.c | 2 +- kernel/zarch/izamin.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 66d2508962..f6fa772acc 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -198,7 +198,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { maxf = camax_kernel_32(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 5abc685b2e..4bd6ca17d6 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -198,7 +198,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { minf = camin_kernel_32(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 113c0cef5d..a9e7f91fcb 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -280,7 +280,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { max = icamax_kernel_32(n1, x, &maxf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 5096b641b4..faf5f9c650 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -280,7 +280,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { min = icamin_kernel_32(n1, x, &minf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2ffad25703..2d1cc23653 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -199,7 
+199,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { max = izamax_kernel_16(n1, x, &maxf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 1e037c0c77..676fd7c6d9 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -199,7 +199,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { min = izamin_kernel_16(n1, x, &minf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 8175874c05..b7214783f7 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { maxf = zamax_kernel_16(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 5d57ff12e1..d53fdb6b8e 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { minf = zamin_kernel_16(n1, x); - + ix = n1 * 2; i = n1; } else From eaf20f0e7ac8c2ab53deeb78f959bebb2a49cddd Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 09:26:50 +0200 Subject: [PATCH 069/189] Remove ztest --- ztest/Makefile | 437 ---------------------------------- ztest/amax.c | 235 ------------------ ztest/amin.c | 235 ------------------ ztest/asum.c | 263 -------------------- ztest/axpy.c | 303 ----------------------- ztest/copy.c | 291 ----------------------- ztest/dot.c | 296 ----------------------- ztest/dsdot.c | 229 ------------------ ztest/gemv.c | 633 ------------------------------------------------- ztest/iamax.c | 284 ---------------------- ztest/iamin.c | 284 ---------------------- ztest/imax.c | 231 ------------------ ztest/imin.c | 231 ------------------ ztest/max.c | 229 ------------------ ztest/min.c | 229 ------------------ ztest/rot.c | 303 ----------------------- ztest/scal.c | 308 ------------------------ ztest/swap.c | 306 ------------------------ 18 files changed, 5327 deletions(-) delete mode 100644 ztest/Makefile delete mode 100644 ztest/amax.c delete mode 100644 ztest/amin.c delete mode 100644 ztest/asum.c delete mode 100644 ztest/axpy.c delete mode 100644 ztest/copy.c delete mode 100644 ztest/dot.c delete mode 100644 ztest/dsdot.c delete mode 100644 ztest/gemv.c delete mode 100644 ztest/iamax.c delete mode 100644 ztest/iamin.c delete mode 100644 ztest/imax.c delete mode 100644 ztest/imin.c delete mode 100644 ztest/max.c delete mode 100644 ztest/min.c delete mode 100644 ztest/rot.c delete mode 100644 ztest/scal.c delete mode 100644 ztest/swap.c diff --git a/ztest/Makefile b/ztest/Makefile deleted file mode 100644 index 0ff7fe46a5..0000000000 --- a/ztest/Makefile +++ /dev/null @@ -1,437 +0,0 @@ -TOPDIR = .. 
-include $(TOPDIR)/Makefile.system - -goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto - -##################################### Sdot #################################################### -sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Ddot #################################################### -ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cdot #################################################### -cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zdot #################################################### -zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dsdot #################################################### -dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMAX ############################################## -isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMAX ############################################## -idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMAX ############################################## -icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMAX ############################################## -izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMAX ############################################## -samax.goto : samax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMAX ############################################## -damax.goto : damax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMAX ############################################## -ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMAX ############################################## -idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMAX 
############################################## -smax.goto : smax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMAX ############################################## -dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMIN ############################################## -isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMIN ############################################## -idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMIN ############################################## -icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMIN ############################################## -izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMIN ############################################## -samin.goto : samin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMIN ############################################## -damin.goto : damin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMIN ############################################## -ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMIN ############################################## -idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMIN ############################################## -smin.goto : smin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMIN ############################################## -dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cgemv #################################################### - 
-cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sscal #################################################### -sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dscal #################################################### -dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cscal #################################################### - -cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zscal #################################################### - -zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Srot #################################################### -srot.goto : srot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Drot #################################################### -drot.goto : drot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Crot #################################################### -crot.goto : crot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zrot #################################################### -zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sswap #################################################### -sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dswap #################################################### -dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cswap #################################################### - -cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm - -##################################### Zswap #################################################### - -zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sasum #################################################### -sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dasum #################################################### -dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Casum #################################################### - -casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zasum #################################################### - -zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Scopy #################################################### -scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dcopy #################################################### -dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Ccopy #################################################### - -ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zcopy #################################################### - -zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -################################################################################################### - -sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -dsdot.$(SUFFIX) : dsdot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -isamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c 
-DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -samax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ismax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -smax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -isamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -samin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -camin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zamin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ismin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -smin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -crot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - 
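For reference while reading the suffix rules being deleted above: every benchmark object is compiled from one generic driver source, with the numeric type selected purely by preprocessor flags, -UCOMPLEX/-DCOMPLEX crossed with -UDOUBLE/-DDOUBLE, so dot.c alone yields the sdot, ddot, cdot and zdot binaries. A minimal sketch of the flag-to-type mapping these drivers assume (common.h provides definitions along these lines; this is a paraphrase, not a verbatim copy):

    /* Sketch: how the DOUBLE/COMPLEX flags select element type and width. */
    #ifdef DOUBLE
    #define FLOAT double
    #else
    #define FLOAT float
    #endif

    #ifdef COMPLEX
    #define COMPSIZE 2   /* two FLOAT slots per element: re, im */
    #else
    #define COMPSIZE 1
    #endif

This is why the allocation sites in the drivers below size every buffer as sizeof(FLOAT) * n * abs(inc) * COMPSIZE.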
-dasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -casum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -clean :: - @rm -f *.goto - diff --git a/ztest/amax.c b/ztest/amax.c deleted file mode 100644 index f2e3f54119..0000000000 --- a/ztest/amax.c +++ /dev/null @@ -1,235 +0,0 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - - if (n <= 0 || inc_x <= 0) return(maxf); - - maxf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) > maxf ) - { - maxf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(maxf); -} - -#undef AMAX -#ifdef DOUBLE -#define AMAX BLASFUNC(damax) -#else -#define AMAX BLASFUNC(samax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* 
avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - - if (n <= 0 || inc_x <= 0) return(minf); - - minf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) < minf ) - { - minf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(minf); -} - -#undef AMIN -#ifdef DOUBLE -#define AMIN BLASFUNC(damin) -#else -#define AMIN BLASFUNC(samin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - 
BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); - - inc_x2 = 2 * inc_x; - - n *= inc_x2; - while(i < n) - { - sumf += CABS1(x,i); - i += inc_x2; - } - return(sumf); -} -#else -FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - FLOAT sumf = 0.0; - if (n <= 0 || inc_x <= 0) return(sumf); - - n *= inc_x; - while(i < n) - { - sumf += ABS(x[i]); - i += inc_x; - } - return(sumf); -} -#endif - -#undef ASUM -#ifdef COMPLEX -#ifdef DOUBLE -#define ASUM BLASFUNC(dzasum) -#else -#define ASUM BLASFUNC(scasum) -#endif -#else -#ifdef DOUBLE -#define ASUM BLASFUNC(dasum) -#else -#define ASUM BLASFUNC(sasum) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, 
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix,iy; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - if ( da_r == 0.0 && da_i == 0.0 ) return(0); - - ix = 0; - iy = 0; - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { -#if !defined(CONJ) - y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; - y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; -#else - y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; - y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; -#endif - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); - -} -#else -int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix,iy; - - if ( n < 0 ) return(0); - if ( da == 0.0 ) return(0); - - ix = 0; - iy = 0; - - while(i < n) - { - - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); - -} -#endif - -#undef AXPY -#ifdef COMPLEX -#ifdef DOUBLE -#define AXPY BLASFUNC(zaxpy) -#else -#define AXPY BLASFUNC(caxpy) -#endif -#else -#ifdef DOUBLE -#define AXPY BLASFUNC(daxpy) -#else -#define AXPY BLASFUNC(saxpy) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *y_c;; - FLOAT alpha[2] = { 2.0, 2.0 }; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - argc--;argv++; - - blasint iy; - int test = 1; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of 
Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { - - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2; - iy += inc_y2; - i++ ; - - } - return(0); - -} -#else -int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - if ( n < 0 ) return(0); - - while(i < n) - { - - y[iy] = x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); - -} -#endif - -#undef COPY -#ifdef COMPLEX -#ifdef DOUBLE -#define COPY BLASFUNC(zcopy) -#else -#define COPY BLASFUNC(ccopy) -#endif -#else -#ifdef DOUBLE -#define COPY BLASFUNC(dcopy) -#else -#define COPY BLASFUNC(scopy) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *y_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) 
inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT dot[2]; - OPENBLAS_COMPLEX_FLOAT result; - BLASLONG inc_x2; - BLASLONG inc_y2; - - dot[0]=0.0; - dot[1]=0.0; - - CREAL(result) = 0.0 ; - CIMAG(result) = 0.0 ; - - if ( n < 1 ) return(result); - - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - - while(i < n) - { -#if !defined(CONJ) - dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; - dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; -#else - dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; - dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; -#endif - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - CREAL(result) = dot[0]; - CIMAG(result) = dot[1]; - return(result); - -} -#else -FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT dot = 0.0 ; - - if ( n < 0 ) return(dot); - - while(i < n) - { - - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); -} -#endif - -#undef DOT -#ifdef COMPLEX -#ifdef DOUBLE -#define DOT BLASFUNC(zdotu) -#else -#define DOT BLASFUNC(cdotu) -#endif -#else -#ifdef DOUBLE -#define DOT BLASFUNC(ddot) -#else -#define DOT BLASFUNC(sdot) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, 
SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y; -#ifdef COMPLEX - OPENBLAS_COMPLEX_FLOAT result, result_c; -#else - FLOAT result, result_c; -#endif - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - double dot = 0.0 ; - - if ( n < 0 ) return(dot); - - while(i < n) - { - - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); -} - -#undef DSDOT -#define DSDOT BLASFUNC(dsdot) - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - 
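dsdot is the one mixed-precision routine benchmarked in this tree: x and y hold single-precision values (the dsdot driver is only built with -UCOMPLEX -UDOUBLE, so FLOAT is float), while the reference dsdot_c above accumulates and returns the result as double, so long sums lose far less to rounding than plain sdot. A minimal sketch of that contract, using a hypothetical helper name and explicit casts for clarity:

    /* Sketch: float inputs, double accumulation and double result. */
    double dsdot_ref(int n, const float *x, const float *y)
    {
        double dot = 0.0;
        for (int i = 0; i < n; i++)
            dot += (double)x[i] * (double)y[i]; /* widen before accumulating */
        return dot;
    }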
-int main(int argc, char *argv[]){ - - FLOAT *x, *y; - double result, result_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i; - BLASLONG ix, iy; - BLASLONG j; - FLOAT *a_ptr; - FLOAT temp_r, temp_i; - BLASLONG inc_x2, inc_y2; - BLASLONG lda2; - BLASLONG i2; - - lda2 = 2 * lda; - - ix = 0; - a_ptr = a; - - if (inc_x == 1 && inc_y == 1) - { - - for (j = 0; jtv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y, *y_c; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 0.0}; - char trans='N'; - blasint m, i, j; - blasint inc_x=1,inc_y=1; - blasint n=0; - int has_param_n = 0; - int has_param_m = 0; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint y_size; - blasint iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - - int tomax = to; - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = 
atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; - if ((p = getenv("OPENBLAS_PARAM_N"))) { - n = atoi(p); - if ((n>0)) has_param_n = 1; - if ( n > tomax ) tomax = n; - } - if ( has_param_n == 0 ) - if ((p = getenv("OPENBLAS_PARAM_M"))) { - m = atoi(p); - if ((m>0)) has_param_m = 1; - if ( m > tomax ) tomax = m; - } - - - - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - if (has_param_m == 0) - { - - for(m = from; m <= to; m += step) - { - timeg=0; - timeg_c=0; - if ( has_param_n == 0 ) n = m; - fprintf(stderr, " %6dx%d :", (int)m,(int)n); - for(j = 0; j < m; j++){ - for(i = 0; i < n * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf; - BLASLONG max=0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return(max+1); -} -#else -BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - BLASLONG max=0; - - if (n <= 0 || inc_x <= 0) return(max); - - maxf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) > maxf ) - { - max = i; - maxf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(max+1); -} -#endif - -#undef IAMAX -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMAX BLASFUNC(izamax) -#else -#define IAMAX BLASFUNC(icamax) -#endif -#else -#ifdef DOUBLE -#define IAMAX BLASFUNC(idamax) -#else -#define IAMAX BLASFUNC(isamax) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into 
microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf; - BLASLONG min=0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(min); - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return(min+1); -} -#else -BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - BLASLONG min=0; - - if (n <= 0 || inc_x <= 0) return(min); - - minf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) < minf ) - { - min = i; - minf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(min+1); -} -#endif - -#undef IAMIN -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMIN BLASFUNC(izamin) -#else -#define IAMIN BLASFUNC(icamin) -#endif -#else -#ifdef DOUBLE -#define IAMIN BLASFUNC(idamin) -#else -#define IAMIN BLASFUNC(isamin) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval 
*tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - BLASLONG max=0; - - if (n <= 0 || inc_x <= 0) return(max); - - maxf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] > maxf ) - { - max = i; - maxf = x[ix]; - } - ix += inc_x; - i++; - } - return(max+1); -} - -#undef IMAX -#ifdef DOUBLE -#define IMAX BLASFUNC(idmax) -#else -#define IMAX BLASFUNC(ismax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); 
- tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - BLASLONG min=0; - - if (n <= 0 || inc_x <= 0) return(min); - - minf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] < minf ) - { - min = i; - minf = x[ix]; - } - ix += inc_x; - i++; - } - return(min+1); -} - -#undef IMIN -#ifdef DOUBLE -#define IMIN BLASFUNC(idmin) -#else -#define IMIN BLASFUNC(ismin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | 
IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - - if (n <= 0 || inc_x <= 0) return(maxf); - - maxf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] > maxf ) - { - maxf = x[ix]; - } - ix += inc_x; - i++; - } - return(maxf); -} - -#undef MAX_ -#ifdef DOUBLE -#define MAX_ BLASFUNC(dmax) -#else -#define MAX_ BLASFUNC(smax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - 
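/* Note how each of these harnesses times two code paths per size: the
   optimized BLAS call (accumulated into timeg, printed under "Time") and
   the plain C reference defined above (timeg_c, printed under "CTime"),
   with assert_dbl_near() presumably feeding the "Test" column. It returns
   1 when |exp - real| <= tol, so with DOUBLE_EPS = 1e-13,
   assert_dbl_near(1.0, 1.0 + 5e-14, DOUBLE_EPS) is 1 while
   assert_dbl_near(1.0, 1.0 + 5e-13, DOUBLE_EPS) is 0. */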
blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - - if (n <= 0 || inc_x <= 0) return(minf); - - minf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] < minf ) - { - minf = x[ix]; - } - ix += inc_x; - i++; - } - return(minf); -} - -#undef MIN_ -#ifdef DOUBLE -#define MIN_ BLASFUNC(dmin) -#else -#define MIN_ BLASFUNC(smin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = 
atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); -} -#else -int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); - - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; - - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); -} -#endif - -#undef ROT -#ifdef COMPLEX -#ifdef DOUBLE -#define ROT BLASFUNC(zdrot) -#else -#define ROT BLASFUNC(csrot) -#endif -#else -#ifdef DOUBLE -#define ROT BLASFUNC(drot) -#else -#define ROT BLASFUNC(srot) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *x_c, *y_c; - // FLOAT result; - blasint m, i; - blasint inc_x=1,inc_y=1; - FLOAT c[1] = { 2.0 }; - FLOAT s[1] = { 2.0 }; - int loops = 1; - int l; - char *p; - - int from = 1; 
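/* rot_c() above applies x' = c*x + s*y, y' = c*y - s*x to each pair (and
   zrot_c() does the same to real and imaginary components separately).
   A worked pair with this harness's c = s = 2: (x, y) = (1, 2) gives
   x' = 2*1 + 2*2 = 6 and y' = 2*2 - 2*1 = 2. Since c*c + s*s != 1 this is
   not a unitary Givens rotation, but it drives the identical kernel path. */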
- int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix,iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; - - if ( (n <= 0) || (inc_x <= 0)) - return(0); - - inc_x2 = 2 * inc_x; - for ( i=0; itv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *x_c; - FLOAT alpha[2] = { 2.0, 2.0 }; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", 
from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); -} -#else -int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n < 0 ) return(0); - - while(i < n) - { - - temp = x[ix] ; - x[ix] = y[iy] ; - y[iy] = temp ; - - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); -} -#endif - -#undef SWAP -#ifdef COMPLEX -#ifdef DOUBLE -#define SWAP BLASFUNC(zswap) -#else -#define SWAP BLASFUNC(cswap) -#endif -#else -#ifdef DOUBLE -#define SWAP BLASFUNC(dswap) -#else -#define SWAP BLASFUNC(sswap) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *x_c, *y_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - 
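/* The measurement loop inside the per-size for(m = from; m <= to; m += step)
   block is essentially the same in every one of these harnesses; a sketch of
   it for this file (an approximation built from the gettimeofday() shim and
   the SWAP/swap_c names defined above; the real variant of swap_c is shown,
   the complex variant takes one extra dummy argument):

       for (l = 0; l < loops; l++) {
           // refill x/y with random data and mirror them into x_c/y_c
           gettimeofday(&start, (struct timezone *)0);
           SWAP(&m, x, &inc_x, y, &inc_y);
           gettimeofday(&stop, (struct timezone *)0);
           timeg += (double)(stop.tv_sec - start.tv_sec)
                  + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;

           gettimeofday(&start, (struct timezone *)0);
           swap_c(m, 0, 0, 0, x_c, inc_x, y_c, inc_y, NULL, 0);
           gettimeofday(&stop, (struct timezone *)0);
           timeg_c += (double)(stop.tv_sec - start.tv_sec)
                    + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;

           // element-wise assert_dbl_near() comparison of x/y against
           // x_c/y_c, recorded in `test`
       }
*/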
int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix,iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l Date: Thu, 31 Jan 2019 15:25:15 +0100 Subject: [PATCH 070/189] Fix wrong comparison that made IMIN identical to IMAX as suggested in #1990 --- kernel/arm/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c index 598cba3871..ffc65226ed 100644 --- a/kernel/arm/imin.c +++ b/kernel/arm/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; From 86a824c97f1f4ccfe8b24678dc0fdaf4846a7055 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 15:27:21 +0100 Subject: [PATCH 071/189] Fix wrong comparison that made IMIN identical to IMAX as reported by aarnez in #1990 --- kernel/mips/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/imin.c b/kernel/mips/imin.c index d9b283d2d9..bf130613bf 100644 --- a/kernel/mips/imin.c +++ b/kernel/mips/imin.c @@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; From 48b9b94f7f7d1856babac7f20f7e9d90fa8750d0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 18:52:11 +0200 Subject: [PATCH 072/189] [ZARCH] Improve loading performance for camax/icamax --- kernel/zarch/camax.c | 128 ++++++++++++++++++------------------------ kernel/zarch/camin.c | 128 ++++++++++++++++++------------------------ kernel/zarch/icamax.c | 114 ++++++++++++++++--------------------- kernel/zarch/icamin.c | 114 ++++++++++++++++--------------------- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 6 files changed, 212 insertions(+), 276 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index f6fa772acc..2e9648640a 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -52,82 +52,66 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "vflpsb %%v0,%%v0 \n\t" "vflpsb %%v16,%%v16 \n\t" "vfasb %%v0,%%v0,%%v16 \n\t" + "vleib %%v1,0,0 \n\t" + "vleib %%v1,1,1 \n\t" + "vleib %%v1,2,2 \n\t" + "vleib %%v1,3,3 \n\t" + "vleib %%v1,8,4 \n\t" + "vleib %%v1,9,5 \n\t" + "vleib %%v1,10,6 \n\t" + 
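/* These vleib stores assemble a byte-permutation pattern in %%v1:
   {0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27}. Applied by vperm to the
   32-byte concatenation of two vl loads (each holding two complex float
   elements laid out re,im), those byte indices select float lanes 0 and 2
   of each source vector, i.e. the four real parts; the companion vpkg keeps
   the low-order word of every doubleword, i.e. the four imaginary parts.
   Two vl plus one vperm and one vpkg thus replace the eight per-lane vlef
   loads deleted in this hunk, which is where the speedup of this patch
   comes from; icamax/icamin below build the same pattern in %%v9. */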
"vleib %%v1,11,7 \n\t" + "vleib %%v1,16,8 \n\t" + "vleib %%v1,17,9 \n\t" + "vleib %%v1,18,10 \n\t" + "vleib %%v1,19,11 \n\t" + "vleib %%v1,24,12 \n\t" + "vleib %%v1,25,13 \n\t" + "vleib %%v1,26,14 \n\t" + "vleib %%v1,27,15 \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),1 \n\t" - "vlef %%v17,12(%%r1,%2),1 \n\t" - "vlef %%v16,16(%%r1,%2),2 \n\t" - "vlef %%v17,20(%%r1,%2),2 \n\t" - "vlef %%v16,24(%%r1,%2),3 \n\t" - "vlef %%v17,28(%%r1,%2),3 \n\t" - - "vlef %%v18,32(%%r1,%2),0 \n\t" - "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),1 \n\t" - "vlef %%v19,44(%%r1,%2),1 \n\t" - "vlef %%v18,48(%%r1,%2),2 \n\t" - "vlef %%v19,52(%%r1,%2),2 \n\t" - "vlef %%v18,56(%%r1,%2),3 \n\t" - "vlef %%v19,30(%%r1,%2),3 \n\t" - - "vlef %%v20,64(%%r1,%2),0 \n\t" - "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),1 \n\t" - "vlef %%v21,76(%%r1,%2),1 \n\t" - "vlef %%v20,80(%%r1,%2),2 \n\t" - "vlef %%v21,84(%%r1,%2),2 \n\t" - "vlef %%v20,88(%%r1,%2),3 \n\t" - "vlef %%v21,92(%%r1,%2),3 \n\t" - - "vlef %%v22,96(%%r1,%2),0 \n\t" - "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),1 \n\t" - "vlef %%v23,108(%%r1,%2),1 \n\t" - "vlef %%v22,112(%%r1,%2),2 \n\t" - "vlef %%v23,116(%%r1,%2),2 \n\t" - "vlef %%v22,120(%%r1,%2),3 \n\t" - "vlef %%v23,124(%%r1,%2),3 \n\t" - - "vlef %%v24,128(%%r1,%2),0 \n\t" - "vlef %%v25,132(%%r1,%2),0 \n\t" - "vlef %%v24,136(%%r1,%2),1 \n\t" - "vlef %%v25,140(%%r1,%2),1 \n\t" - "vlef %%v24,144(%%r1,%2),2 \n\t" - "vlef %%v25,148(%%r1,%2),2 \n\t" - "vlef %%v24,152(%%r1,%2),3 \n\t" - "vlef %%v25,156(%%r1,%2),3 \n\t" - - "vlef %%v26,160(%%r1,%2),0 \n\t" - "vlef %%v27,164(%%r1,%2),0 \n\t" - "vlef %%v26,168(%%r1,%2),1 \n\t" - "vlef %%v27,172(%%r1,%2),1 \n\t" - "vlef %%v26,176(%%r1,%2),2 \n\t" - "vlef %%v27,180(%%r1,%2),2 \n\t" - "vlef %%v26,184(%%r1,%2),3 \n\t" - "vlef %%v27,188(%%r1,%2),3 \n\t" - - "vlef %%v28,192(%%r1,%2),0 \n\t" - "vlef %%v29,196(%%r1,%2),0 \n\t" - "vlef %%v28,200(%%r1,%2),1 \n\t" - "vlef %%v29,204(%%r1,%2),1 \n\t" - "vlef %%v28,208(%%r1,%2),2 \n\t" - "vlef %%v29,212(%%r1,%2),2 \n\t" - "vlef %%v28,216(%%r1,%2),3 \n\t" - "vlef %%v29,220(%%r1,%2),3 \n\t" - - "vlef %%v30,224(%%r1,%2),0 \n\t" - "vlef %%v31,228(%%r1,%2),0 \n\t" - "vlef %%v30,232(%%r1,%2),1 \n\t" - "vlef %%v31,236(%%r1,%2),1 \n\t" - "vlef %%v30,240(%%r1,%2),2 \n\t" - "vlef %%v31,244(%%r1,%2),2 \n\t" - "vlef %%v30,248(%%r1,%2),3 \n\t" - "vlef %%v31,252(%%r1,%2),3 \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v2,16(%%r1,%2) \n\t" + "vpkg %%v17,%%v16,%%v2 \n\t" + "vperm %%v16,%%v16,%%v2,%%v1 \n\t" + + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v2,48(%%r1,%2) \n\t" + "vpkg %%v19,%%v18,%%v2 \n\t" + "vperm %%v18,%%v18,%%v2,%%v1 \n\t" + + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v2,80(%%r1,%2) \n\t" + "vpkg %%v21,%%v20,%%v2 \n\t" + "vperm %%v20,%%v20,%%v2,%%v1 \n\t" + + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v2,112(%%r1,%2) \n\t" + "vpkg %%v23,%%v22,%%v2 \n\t" + "vperm %%v22,%%v22,%%v2,%%v1 \n\t" + + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v2,144(%%r1,%2) \n\t" + "vpkg %%v25,%%v24,%%v2 \n\t" + "vperm %%v24,%%v24,%%v2,%%v1 \n\t" + + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v2,176(%%r1,%2) \n\t" + "vpkg %%v27,%%v26,%%v2 \n\t" + "vperm %%v26,%%v26,%%v2,%%v1 \n\t" + + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v2,208(%%r1,%2) \n\t" + "vpkg %%v29,%%v28,%%v2 \n\t" + "vperm %%v28,%%v28,%%v2,%%v1 \n\t" + + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v2,240(%%r1,%2) \n\t" + "vpkg %%v31,%%v30,%%v2 \n\t" + "vperm 
%%v30,%%v30,%%v2,%%v1 \n\t" "vflpsb %%v16,%%v16 \n\t" "vflpsb %%v17,%%v17 \n\t" @@ -178,7 +162,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "ler %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amax; diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 4bd6ca17d6..aec59058ed 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -52,82 +52,66 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vflpsb %%v0,%%v0 \n\t" "vflpsb %%v16,%%v16 \n\t" "vfasb %%v0,%%v0,%%v16 \n\t" + "vleib %%v1,0,0 \n\t" + "vleib %%v1,1,1 \n\t" + "vleib %%v1,2,2 \n\t" + "vleib %%v1,3,3 \n\t" + "vleib %%v1,8,4 \n\t" + "vleib %%v1,9,5 \n\t" + "vleib %%v1,10,6 \n\t" + "vleib %%v1,11,7 \n\t" + "vleib %%v1,16,8 \n\t" + "vleib %%v1,17,9 \n\t" + "vleib %%v1,18,10 \n\t" + "vleib %%v1,19,11 \n\t" + "vleib %%v1,24,12 \n\t" + "vleib %%v1,25,13 \n\t" + "vleib %%v1,26,14 \n\t" + "vleib %%v1,27,15 \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),1 \n\t" - "vlef %%v17,12(%%r1,%2),1 \n\t" - "vlef %%v16,16(%%r1,%2),2 \n\t" - "vlef %%v17,20(%%r1,%2),2 \n\t" - "vlef %%v16,24(%%r1,%2),3 \n\t" - "vlef %%v17,28(%%r1,%2),3 \n\t" - - "vlef %%v18,32(%%r1,%2),0 \n\t" - "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),1 \n\t" - "vlef %%v19,44(%%r1,%2),1 \n\t" - "vlef %%v18,48(%%r1,%2),2 \n\t" - "vlef %%v19,52(%%r1,%2),2 \n\t" - "vlef %%v18,56(%%r1,%2),3 \n\t" - "vlef %%v19,30(%%r1,%2),3 \n\t" - - "vlef %%v20,64(%%r1,%2),0 \n\t" - "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),1 \n\t" - "vlef %%v21,76(%%r1,%2),1 \n\t" - "vlef %%v20,80(%%r1,%2),2 \n\t" - "vlef %%v21,84(%%r1,%2),2 \n\t" - "vlef %%v20,88(%%r1,%2),3 \n\t" - "vlef %%v21,92(%%r1,%2),3 \n\t" - - "vlef %%v22,96(%%r1,%2),0 \n\t" - "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),1 \n\t" - "vlef %%v23,108(%%r1,%2),1 \n\t" - "vlef %%v22,112(%%r1,%2),2 \n\t" - "vlef %%v23,116(%%r1,%2),2 \n\t" - "vlef %%v22,120(%%r1,%2),3 \n\t" - "vlef %%v23,124(%%r1,%2),3 \n\t" - - "vlef %%v24,128(%%r1,%2),0 \n\t" - "vlef %%v25,132(%%r1,%2),0 \n\t" - "vlef %%v24,136(%%r1,%2),1 \n\t" - "vlef %%v25,140(%%r1,%2),1 \n\t" - "vlef %%v24,144(%%r1,%2),2 \n\t" - "vlef %%v25,148(%%r1,%2),2 \n\t" - "vlef %%v24,152(%%r1,%2),3 \n\t" - "vlef %%v25,156(%%r1,%2),3 \n\t" - - "vlef %%v26,160(%%r1,%2),0 \n\t" - "vlef %%v27,164(%%r1,%2),0 \n\t" - "vlef %%v26,168(%%r1,%2),1 \n\t" - "vlef %%v27,172(%%r1,%2),1 \n\t" - "vlef %%v26,176(%%r1,%2),2 \n\t" - "vlef %%v27,180(%%r1,%2),2 \n\t" - "vlef %%v26,184(%%r1,%2),3 \n\t" - "vlef %%v27,188(%%r1,%2),3 \n\t" - - "vlef %%v28,192(%%r1,%2),0 \n\t" - "vlef %%v29,196(%%r1,%2),0 \n\t" - "vlef %%v28,200(%%r1,%2),1 \n\t" - "vlef %%v29,204(%%r1,%2),1 \n\t" - "vlef %%v28,208(%%r1,%2),2 \n\t" - "vlef %%v29,212(%%r1,%2),2 \n\t" - "vlef %%v28,216(%%r1,%2),3 \n\t" - "vlef %%v29,220(%%r1,%2),3 \n\t" - - "vlef %%v30,224(%%r1,%2),0 \n\t" - "vlef %%v31,228(%%r1,%2),0 \n\t" - "vlef %%v30,232(%%r1,%2),1 \n\t" - "vlef %%v31,236(%%r1,%2),1 \n\t" - "vlef %%v30,240(%%r1,%2),2 \n\t" - "vlef %%v31,244(%%r1,%2),2 \n\t" - "vlef %%v30,248(%%r1,%2),3 \n\t" - "vlef %%v31,252(%%r1,%2),3 \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v2,16(%%r1,%2) \n\t" + "vpkg 
%%v17,%%v16,%%v2 \n\t" + "vperm %%v16,%%v16,%%v2,%%v1 \n\t" + + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v2,48(%%r1,%2) \n\t" + "vpkg %%v19,%%v18,%%v2 \n\t" + "vperm %%v18,%%v18,%%v2,%%v1 \n\t" + + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v2,80(%%r1,%2) \n\t" + "vpkg %%v21,%%v20,%%v2 \n\t" + "vperm %%v20,%%v20,%%v2,%%v1 \n\t" + + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v2,112(%%r1,%2) \n\t" + "vpkg %%v23,%%v22,%%v2 \n\t" + "vperm %%v22,%%v22,%%v2,%%v1 \n\t" + + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v2,144(%%r1,%2) \n\t" + "vpkg %%v25,%%v24,%%v2 \n\t" + "vperm %%v24,%%v24,%%v2,%%v1 \n\t" + + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v2,176(%%r1,%2) \n\t" + "vpkg %%v27,%%v26,%%v2 \n\t" + "vperm %%v26,%%v26,%%v2,%%v1 \n\t" + + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v2,208(%%r1,%2) \n\t" + "vpkg %%v29,%%v28,%%v2 \n\t" + "vperm %%v28,%%v28,%%v2,%%v1 \n\t" + + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v2,240(%%r1,%2) \n\t" + "vpkg %%v31,%%v30,%%v2 \n\t" + "vperm %%v30,%%v30,%%v2,%%v1 \n\t" "vflpsb %%v16,%%v16 \n\t" "vflpsb %%v17,%%v17 \n\t" @@ -178,7 +162,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "ler %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amin; diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index a9e7f91fcb..5129ca6ee3 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -57,6 +57,22 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vleig %%v2,3,1 \n\t" "vrepig %%v3,16 \n\t" "vzero %%v4 \n\t" + "vleib %%v9,0,0 \n\t" + "vleib %%v9,1,1 \n\t" + "vleib %%v9,2,2 \n\t" + "vleib %%v9,3,3 \n\t" + "vleib %%v9,8,4 \n\t" + "vleib %%v9,9,5 \n\t" + "vleib %%v9,10,6 \n\t" + "vleib %%v9,11,7 \n\t" + "vleib %%v9,16,8 \n\t" + "vleib %%v9,17,9 \n\t" + "vleib %%v9,18,10 \n\t" + "vleib %%v9,19,11 \n\t" + "vleib %%v9,24,12 \n\t" + "vleib %%v9,25,13 \n\t" + "vleib %%v9,26,14 \n\t" + "vleib %%v9,27,15 \n\t" "vleif %%v24,0,0 \n\t" "vleif %%v24,1,1 \n\t" "vleif %%v24,2,2 \n\t" @@ -78,41 +94,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" - "vlef %%v16,0(%%r1,%3),0 \n\t" - "vlef %%v17,4(%%r1,%3),0 \n\t" - "vlef %%v16,8(%%r1,%3),1 \n\t" - "vlef %%v17,12(%%r1,%3),1 \n\t" - "vlef %%v16,16(%%r1,%3),2 \n\t" - "vlef %%v17,20(%%r1,%3),2 \n\t" - "vlef %%v16,24(%%r1,%3),3 \n\t" - "vlef %%v17,28(%%r1,%3),3 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v28,16(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,32(%%r1,%3),0 \n\t" - "vlef %%v19,36(%%r1,%3),0 \n\t" - "vlef %%v18,40(%%r1,%3),1 \n\t" - "vlef %%v19,44(%%r1,%3),1 \n\t" - "vlef %%v18,48(%%r1,%3),2 \n\t" - "vlef %%v19,52(%%r1,%3),2 \n\t" - "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,60(%%r1,%3),3 \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v29,48(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,64(%%r1,%3),0 \n\t" - "vlef %%v21,68(%%r1,%3),0 \n\t" - "vlef %%v20,72(%%r1,%3),1 \n\t" - "vlef %%v21,76(%%r1,%3),1 \n\t" - "vlef %%v20,80(%%r1,%3),2 \n\t" - "vlef %%v21,84(%%r1,%3),2 \n\t" - "vlef %%v20,88(%%r1,%3),3 \n\t" - "vlef %%v21,92(%%r1,%3),3 \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,96(%%r1,%3),0 \n\t" - "vlef 
%%v23,100(%%r1,%3),0 \n\t" - "vlef %%v22,104(%%r1,%3),1 \n\t" - "vlef %%v23,108(%%r1,%3),1 \n\t" - "vlef %%v22,112(%%r1,%3),2 \n\t" - "vlef %%v23,116(%%r1,%3),2 \n\t" - "vlef %%v22,120(%%r1,%3),3 \n\t" - "vlef %%v23,124(%%r1,%3),3 \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -151,41 +151,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vlef %%v16,128(%%r1,%3),0 \n\t" - "vlef %%v17,132(%%r1,%3),0 \n\t" - "vlef %%v16,136(%%r1,%3),1 \n\t" - "vlef %%v17,140(%%r1,%3),1 \n\t" - "vlef %%v16,144(%%r1,%3),2 \n\t" - "vlef %%v17,148(%%r1,%3),2 \n\t" - "vlef %%v16,152(%%r1,%3),3 \n\t" - "vlef %%v17,156(%%r1,%3),3 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v28,144(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,160(%%r1,%3),0 \n\t" - "vlef %%v19,164(%%r1,%3),0 \n\t" - "vlef %%v18,168(%%r1,%3),1 \n\t" - "vlef %%v19,172(%%r1,%3),1 \n\t" - "vlef %%v18,176(%%r1,%3),2 \n\t" - "vlef %%v19,180(%%r1,%3),2 \n\t" - "vlef %%v18,184(%%r1,%3),3 \n\t" - "vlef %%v19,188(%%r1,%3),3 \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v29,176(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,192(%%r1,%3),0 \n\t" - "vlef %%v21,196(%%r1,%3),0 \n\t" - "vlef %%v20,200(%%r1,%3),1 \n\t" - "vlef %%v21,204(%%r1,%3),1 \n\t" - "vlef %%v20,208(%%r1,%3),2 \n\t" - "vlef %%v21,212(%%r1,%3),2 \n\t" - "vlef %%v20,216(%%r1,%3),3 \n\t" - "vlef %%v21,220(%%r1,%3),3 \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v30,208(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,224(%%r1,%3),0 \n\t" - "vlef %%v23,228(%%r1,%3),0 \n\t" - "vlef %%v22,232(%%r1,%3),1 \n\t" - "vlef %%v23,236(%%r1,%3),1 \n\t" - "vlef %%v22,240(%%r1,%3),2 \n\t" - "vlef %%v23,244(%%r1,%3),2 \n\t" - "vlef %%v22,248(%%r1,%3),3 \n\t" - "vlef %%v23,252(%%r1,%3),3 \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v31,240(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -258,7 +242,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "nop " :"=r"(iamax),"=m"(*amax) :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return iamax; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index faf5f9c650..05068b212c 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -57,6 +57,22 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vleig %%v2,3,1 \n\t" "vrepig %%v3,16 \n\t" "vzero %%v4 \n\t" + "vleib %%v9,0,0 \n\t" + "vleib %%v9,1,1 \n\t" + "vleib %%v9,2,2 \n\t" + "vleib %%v9,3,3 \n\t" + "vleib %%v9,8,4 \n\t" + "vleib %%v9,9,5 \n\t" + "vleib %%v9,10,6 \n\t" + "vleib %%v9,11,7 \n\t" + "vleib %%v9,16,8 \n\t" + "vleib %%v9,17,9 \n\t" + "vleib %%v9,18,10 \n\t" + "vleib %%v9,19,11 \n\t" + "vleib %%v9,24,12 \n\t" + "vleib %%v9,25,13 \n\t" + "vleib %%v9,26,14 \n\t" + "vleib %%v9,27,15 \n\t" "vleif %%v24,0,0 \n\t" "vleif %%v24,1,1 \n\t" "vleif %%v24,2,2 \n\t" @@ -78,41 +94,25 @@ 
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" - "vlef %%v16,0(%%r1,%3),0 \n\t" - "vlef %%v17,4(%%r1,%3),0 \n\t" - "vlef %%v16,8(%%r1,%3),1 \n\t" - "vlef %%v17,12(%%r1,%3),1 \n\t" - "vlef %%v16,16(%%r1,%3),2 \n\t" - "vlef %%v17,20(%%r1,%3),2 \n\t" - "vlef %%v16,24(%%r1,%3),3 \n\t" - "vlef %%v17,28(%%r1,%3),3 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v28,16(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,32(%%r1,%3),0 \n\t" - "vlef %%v19,36(%%r1,%3),0 \n\t" - "vlef %%v18,40(%%r1,%3),1 \n\t" - "vlef %%v19,44(%%r1,%3),1 \n\t" - "vlef %%v18,48(%%r1,%3),2 \n\t" - "vlef %%v19,52(%%r1,%3),2 \n\t" - "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,60(%%r1,%3),3 \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v29,48(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,64(%%r1,%3),0 \n\t" - "vlef %%v21,68(%%r1,%3),0 \n\t" - "vlef %%v20,72(%%r1,%3),1 \n\t" - "vlef %%v21,76(%%r1,%3),1 \n\t" - "vlef %%v20,80(%%r1,%3),2 \n\t" - "vlef %%v21,84(%%r1,%3),2 \n\t" - "vlef %%v20,88(%%r1,%3),3 \n\t" - "vlef %%v21,92(%%r1,%3),3 \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,96(%%r1,%3),0 \n\t" - "vlef %%v23,100(%%r1,%3),0 \n\t" - "vlef %%v22,104(%%r1,%3),1 \n\t" - "vlef %%v23,108(%%r1,%3),1 \n\t" - "vlef %%v22,112(%%r1,%3),2 \n\t" - "vlef %%v23,116(%%r1,%3),2 \n\t" - "vlef %%v22,120(%%r1,%3),3 \n\t" - "vlef %%v23,124(%%r1,%3),3 \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -151,41 +151,25 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vlef %%v16,128(%%r1,%3),0 \n\t" - "vlef %%v17,132(%%r1,%3),0 \n\t" - "vlef %%v16,136(%%r1,%3),1 \n\t" - "vlef %%v17,140(%%r1,%3),1 \n\t" - "vlef %%v16,144(%%r1,%3),2 \n\t" - "vlef %%v17,148(%%r1,%3),2 \n\t" - "vlef %%v16,152(%%r1,%3),3 \n\t" - "vlef %%v17,156(%%r1,%3),3 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v28,144(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,160(%%r1,%3),0 \n\t" - "vlef %%v19,164(%%r1,%3),0 \n\t" - "vlef %%v18,168(%%r1,%3),1 \n\t" - "vlef %%v19,172(%%r1,%3),1 \n\t" - "vlef %%v18,176(%%r1,%3),2 \n\t" - "vlef %%v19,180(%%r1,%3),2 \n\t" - "vlef %%v18,184(%%r1,%3),3 \n\t" - "vlef %%v19,188(%%r1,%3),3 \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v29,176(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,192(%%r1,%3),0 \n\t" - "vlef %%v21,196(%%r1,%3),0 \n\t" - "vlef %%v20,200(%%r1,%3),1 \n\t" - "vlef %%v21,204(%%r1,%3),1 \n\t" - "vlef %%v20,208(%%r1,%3),2 \n\t" - "vlef %%v21,212(%%r1,%3),2 \n\t" - "vlef %%v20,216(%%r1,%3),3 \n\t" - "vlef %%v21,220(%%r1,%3),3 \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v30,208(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,224(%%r1,%3),0 \n\t" - "vlef %%v23,228(%%r1,%3),0 \n\t" - "vlef %%v22,232(%%r1,%3),1 \n\t" - "vlef %%v23,236(%%r1,%3),1 \n\t" - "vlef %%v22,240(%%r1,%3),2 \n\t" - "vlef %%v23,244(%%r1,%3),2 \n\t" - "vlef %%v22,248(%%r1,%3),3 \n\t" - "vlef %%v23,252(%%r1,%3),3 \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v31,240(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm 
%%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -258,7 +242,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "nop " :"=r"(iamin),"=m"(*amin) :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return iamin; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index b7214783f7..cc63471272 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -132,7 +132,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) "ldr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amax; diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index d53fdb6b8e..18610daea3 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -132,7 +132,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) "ldr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amin; From 29416cb5a37b990052d019f66736af5263a81809 Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 19:11:11 +0200 Subject: [PATCH 073/189] [ZARCH] Add Z13 version for max/min functions --- kernel/zarch/KERNEL.Z13 | 12 +-- kernel/zarch/damax_z13.c | 204 ++++++++++++++++++++++++++++++++++++ kernel/zarch/damin_z13.c | 204 ++++++++++++++++++++++++++++++++++++ kernel/zarch/dmax_z13.c | 180 +++++++++++++++++++++++++++++++ kernel/zarch/dmin_z13.c | 180 +++++++++++++++++++++++++++++++ kernel/zarch/zamax_z13.c | 221 +++++++++++++++++++++++++++++++++++++++ kernel/zarch/zamin_z13.c | 221 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 1216 insertions(+), 6 deletions(-) create mode 100644 kernel/zarch/damax_z13.c create mode 100644 kernel/zarch/damin_z13.c create mode 100644 kernel/zarch/dmax_z13.c create mode 100644 kernel/zarch/dmin_z13.c create mode 100644 kernel/zarch/zamax_z13.c create mode 100644 kernel/zarch/zamin_z13.c diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index e5b974ab4e..22c7e97032 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = damax.c +DAMAXKERNEL = damax_z13.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = zamax.c +ZAMAXKERNEL = zamax_z13.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = damin.c +DAMINKERNEL = damin_z13.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = zamin.c +ZAMINKERNEL = zamin_z13.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = dmax.c +DMAXKERNEL = dmax_z13.c SMINKERNEL = ../arm/min.c -DMINKERNEL = dmin.c +DMINKERNEL = dmin_z13.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c new file mode 100644 index 0000000000..95b94ee4ae --- /dev/null +++ b/kernel/zarch/damax_z13.c @@ -0,0 +1,204 @@ +/*************************************************************************** +Copyright (c) 
2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 
\n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = damax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c new file mode 100644 index 0000000000..538690ee55 --- /dev/null +++ b/kernel/zarch/damin_z13.c @@ -0,0 +1,204 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = damin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + 
minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c new file mode 100644 index 0000000000..83e7b02a86 --- /dev/null +++ b/kernel/zarch/dmax_z13.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT max; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return max; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c new file mode 100644 index 0000000000..e64f90ee38 --- /dev/null +++ b/kernel/zarch/dmin_z13.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 
\n\t" + "ldr %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c new file mode 100644 index 0000000000..ae711c1730 --- /dev/null +++ b/kernel/zarch/zamax_z13.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
+{
+ FLOAT amax;
+
+ __asm__ volatile (
+ "vleg %%v0,0(%2),0 \n\t"
+ "vleg %%v16,8(%2),0 \n\t"
+ "vleg %%v0,16(%2),1 \n\t"
+ "vleg %%v16,24(%2),1 \n\t"
+ "vflpdb %%v0,%%v0 \n\t"
+ "vflpdb %%v16,%%v16 \n\t"
+ "vfadb %%v0,%%v0,%%v16 \n\t"
+ "srlg %%r0,%1,4 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+
+ "vleg %%v16,0(%%r1,%2),0 \n\t"
+ "vleg %%v17,8(%%r1,%2),0 \n\t"
+ "vleg %%v16,16(%%r1,%2),1 \n\t"
+ "vleg %%v17,24(%%r1,%2),1 \n\t"
+ "vleg %%v18,32(%%r1,%2),0 \n\t"
+ "vleg %%v19,40(%%r1,%2),0 \n\t"
+ "vleg %%v18,48(%%r1,%2),1 \n\t"
+ "vleg %%v19,56(%%r1,%2),1 \n\t"
+ "vleg %%v20,64(%%r1,%2),0 \n\t"
+ "vleg %%v21,72(%%r1,%2),0 \n\t"
+ "vleg %%v20,80(%%r1,%2),1 \n\t"
+ "vleg %%v21,88(%%r1,%2),1 \n\t"
+ "vleg %%v22,96(%%r1,%2),0 \n\t"
+ "vleg %%v23,104(%%r1,%2),0 \n\t"
+ "vleg %%v22,112(%%r1,%2),1 \n\t"
+ "vleg %%v23,120(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v16,%%v17 \n\t"
+ "vfchdb %%v25,%%v18,%%v19 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v24,%%v25 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v26,%%v0 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "vleg %%v16,128(%%r1,%2),0 \n\t"
+ "vleg %%v17,136(%%r1,%2),0 \n\t"
+ "vleg %%v16,144(%%r1,%2),1 \n\t"
+ "vleg %%v17,152(%%r1,%2),1 \n\t"
+ "vleg %%v18,160(%%r1,%2),0 \n\t"
+ "vleg %%v19,168(%%r1,%2),0 \n\t"
+ "vleg %%v18,176(%%r1,%2),1 \n\t"
+ "vleg %%v19,184(%%r1,%2),1 \n\t"
+ "vleg %%v20,192(%%r1,%2),0 \n\t"
+ "vleg %%v21,200(%%r1,%2),0 \n\t"
+ "vleg %%v20,208(%%r1,%2),1 \n\t"
+ "vleg %%v21,216(%%r1,%2),1 \n\t"
+ "vleg %%v22,224(%%r1,%2),0 \n\t"
+ "vleg %%v23,232(%%r1,%2),0 \n\t"
+ "vleg %%v22,240(%%r1,%2),1 \n\t"
+ "vleg %%v23,248(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v16,%%v17 \n\t"
+ "vfchdb %%v25,%%v18,%%v19 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v24,%%v25 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v26,%%v0 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "agfi %%r1, 256 \n\t"
+ "brctg %%r0, 0b \n\t"
+
+ "vrepg %%v16,%%v0,1 \n\t"
+ "wfchdb %%v17,%%v0,%%v16 \n\t"
+ "vsel %%v0,%%v0,%%v16,%%v17 \n\t"
+ "ldr %0,%%f0 "
+ :"=f"(amax)
+ :"r"(n),"ZR"((const FLOAT (*)[n])x)
+ :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+ );
+
+ return amax;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+ BLASLONG i = 0;
+ BLASLONG ix = 0;
+ FLOAT maxf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return
(maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } + else + { + maxf=CABS1(x,0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf=CABS1(x,0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); + } + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); + } + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (maxf); + } +} diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c new file mode 100644 index 0000000000..f82c57e81f --- /dev/null +++ b/kernel/zarch/zamin_z13.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
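zamin below mirrors zamax with the comparison operands flipped. As in the other kernels, the driver first carves off n1 = n & -16 elements for the vector routine and finishes the remainder in scalar code; the mask is plain two's-complement arithmetic, for example:

    // n & -16 clears the low four bits, rounding n down to a multiple of 16
    long n = 37;
    long n1 = n & -16;   // 32 elements go through zamin_kernel_16
    long tail = n - n1;  // 5 elements are left for the scalar loop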
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
+{
+ FLOAT amin;
+
+ __asm__ volatile (
+ "vleg %%v0,0(%2),0 \n\t"
+ "vleg %%v16,8(%2),0 \n\t"
+ "vleg %%v0,16(%2),1 \n\t"
+ "vleg %%v16,24(%2),1 \n\t"
+ "vflpdb %%v0,%%v0 \n\t"
+ "vflpdb %%v16,%%v16 \n\t"
+ "vfadb %%v0,%%v0,%%v16 \n\t"
+ "srlg %%r0,%1,4 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%2) \n\t"
+
+ "vleg %%v16,0(%%r1,%2),0 \n\t"
+ "vleg %%v17,8(%%r1,%2),0 \n\t"
+ "vleg %%v16,16(%%r1,%2),1 \n\t"
+ "vleg %%v17,24(%%r1,%2),1 \n\t"
+ "vleg %%v18,32(%%r1,%2),0 \n\t"
+ "vleg %%v19,40(%%r1,%2),0 \n\t"
+ "vleg %%v18,48(%%r1,%2),1 \n\t"
+ "vleg %%v19,56(%%r1,%2),1 \n\t"
+ "vleg %%v20,64(%%r1,%2),0 \n\t"
+ "vleg %%v21,72(%%r1,%2),0 \n\t"
+ "vleg %%v20,80(%%r1,%2),1 \n\t"
+ "vleg %%v21,88(%%r1,%2),1 \n\t"
+ "vleg %%v22,96(%%r1,%2),0 \n\t"
+ "vleg %%v23,104(%%r1,%2),0 \n\t"
+ "vleg %%v22,112(%%r1,%2),1 \n\t"
+ "vleg %%v23,120(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v17,%%v16 \n\t"
+ "vfchdb %%v25,%%v19,%%v18 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v25,%%v24 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v0,%%v26 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "vleg %%v16,128(%%r1,%2),0 \n\t"
+ "vleg %%v17,136(%%r1,%2),0 \n\t"
+ "vleg %%v16,144(%%r1,%2),1 \n\t"
+ "vleg %%v17,152(%%r1,%2),1 \n\t"
+ "vleg %%v18,160(%%r1,%2),0 \n\t"
+ "vleg %%v19,168(%%r1,%2),0 \n\t"
+ "vleg %%v18,176(%%r1,%2),1 \n\t"
+ "vleg %%v19,184(%%r1,%2),1 \n\t"
+ "vleg %%v20,192(%%r1,%2),0 \n\t"
+ "vleg %%v21,200(%%r1,%2),0 \n\t"
+ "vleg %%v20,208(%%r1,%2),1 \n\t"
+ "vleg %%v21,216(%%r1,%2),1 \n\t"
+ "vleg %%v22,224(%%r1,%2),0 \n\t"
+ "vleg %%v23,232(%%r1,%2),0 \n\t"
+ "vleg %%v22,240(%%r1,%2),1 \n\t"
+ "vleg %%v23,248(%%r1,%2),1 \n\t"
+ "vflpdb %%v16, %%v16 \n\t"
+ "vflpdb %%v17, %%v17 \n\t"
+ "vflpdb %%v18, %%v18 \n\t"
+ "vflpdb %%v19, %%v19 \n\t"
+ "vflpdb %%v20, %%v20 \n\t"
+ "vflpdb %%v21, %%v21 \n\t"
+ "vflpdb %%v22, %%v22 \n\t"
+ "vflpdb %%v23, %%v23 \n\t"
+ "vfadb %%v16,%%v16,%%v17 \n\t"
+ "vfadb %%v17,%%v18,%%v19 \n\t"
+ "vfadb %%v18,%%v20,%%v21 \n\t"
+ "vfadb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchdb %%v24,%%v17,%%v16 \n\t"
+ "vfchdb %%v25,%%v19,%%v18 \n\t"
+ "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+ "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+ "vfchdb %%v26,%%v25,%%v24 \n\t"
+ "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+ "vfchdb %%v27,%%v0,%%v26 \n\t"
+ "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+ "agfi %%r1, 256 \n\t"
+ "brctg %%r0, 0b \n\t"
+
+ "vrepg %%v16,%%v0,1 \n\t"
+ "wfchdb %%v17,%%v16,%%v0 \n\t"
+ "vsel %%v0,%%v0,%%v16,%%v17 \n\t"
+ "ldr %0,%%f0 "
+ :"=f"(amin)
+ :"r"(n),"ZR"((const FLOAT (*)[n])x)
+ :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+ );
+
+ return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+ BLASLONG i = 0;
+ BLASLONG ix = 0;
+ FLOAT minf = 0.0;
+ BLASLONG inc_x2;
+
+ if (n <= 0 || inc_x <= 0) return
(minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } + else + { + minf=CABS1(x,0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (minf); + + } else { + + minf=CABS1(x,0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); + } + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); + } + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (minf); + } +} From 1249ee1fd0e62f5386b8b5dbce7b3d5fac785006 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:13:46 +0100 Subject: [PATCH 074/189] Add Z14 target from patch provided by aarnez in #991 --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 3d04a57cf3..3a5a322344 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -94,3 +94,4 @@ THUNDERX2T99 9.System Z: ZARCH_GENERIC Z13 +Z14 From bdc73a49e0e3fe375fe2a015abebc962e29d72af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:14:37 +0100 Subject: [PATCH 075/189] Add parameters for Z14 from patch provided by aarnez in #991 --- param.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/param.h b/param.h index 15ea663a8f..3cc400b548 100644 --- a/param.h +++ b/param.h @@ -2915,6 +2915,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(Z14) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 456 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 224 + +#define SGEMM_DEFAULT_Q 488 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 352 + +#define SGEMM_DEFAULT_R 8192 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 2048 + + +#define SYMV_P 16 +#endif + + #ifdef GENERIC From 72d3e7c9b49af5c13ff1e26d13fc3b35ffd92076 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:15:50 +0100 Subject: [PATCH 076/189] Add FORCE Z14 from patch provided by aarnez in #991 --- getarch.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/getarch.c b/getarch.c index d03ce6e981..242d080044 100644 --- a/getarch.c +++ b/getarch.c @@ -1085,6 +1085,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "Z13" #endif +#ifdef FORCE_Z14 +#define FORCE +#define ARCHITECTURE "ZARCH" +#define SUBARCHITECTURE "Z14" +#define ARCHCONFIG "-DZ14 " \ + "-DDTB_DEFAULT_ENTRIES=64" +#define LIBNAME "z14" +#define CORENAME "Z14" +#endif + #ifndef FORCE #ifdef USER_TARGET From 4b512f84dd2b5861e6c860f68d05e56484efe7ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:16:44 +0100 Subject: [PATCH 077/189] Add cache sizes for Z14 from patch provided by aarnez in #991 --- cpuid_zarch.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 8ed40099b4..896ed94f5d 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -114,7 +114,14 @@ void get_cpuconfig(void) break; case CPU_Z14: printf("#define Z14\n"); + printf("#define L1_DATA_SIZE 131072\n"); + printf("#define L1_DATA_LINESIZE 256\n"); + printf("#define L1_DATA_ASSOCIATIVE 8\n"); + printf("#define L2_SIZE 4194304\n"); + printf("#define L2_LINESIZE 256\n"); + printf("#define L2_ASSOCIATIVE 8\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); break; } } From 885a3c435092f5356ee4665b03d3709ce58a22f1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:18:09 +0100 Subject: [PATCH 078/189] USE_TRMM on Z14 from patch provided by aarnez in #991 --- kernel/Makefile.L3 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 9258f216dd..eafcfb1b41 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -48,6 +48,10 @@ ifeq ($(ARCH), zarch) USE_TRMM = 1 endif +ifeq ($(CORE), Z14) +USE_TRMM = 1 +endif + From 265142edd5dc4c8d7e5e9f781468ac9c5bddb3ba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:21:40 +0100 Subject: [PATCH 079/189] Fix typo in the zarch min/max kernels from patch provided by aarnez in #991 --- kernel/zarch/damax.c | 2 +- kernel/zarch/damin.c | 2 +- kernel/zarch/dmax.c | 2 +- kernel/zarch/dmin.c | 2 +- kernel/zarch/samax.c | 2 +- kernel/zarch/samin.c | 2 +- kernel/zarch/smax.c | 2 +- kernel/zarch/smin.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index a3d63fe532..827467189e 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -81,7 +81,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" - "vfmaxdb %%v0,%%v0,%%16,8 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 738ed8710e..821f9eccc8 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -81,7 +81,7 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) "vfmindb %%v16,%%v16,%%v17,8 \n\t" - "vfmindb %%v0,%%v0,%%16,8 \n\t" + "vfmindb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index aa8b932f9f..5ec54c7bf7 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -74,7 +74,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) "vfmaxdb %%v16,%%v16,%%v17,0 \n\t" - "vfmaxdb %%v0,%%v0,%%16,0 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 8ae5fe868c..073289186e 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -74,7 +74,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) "vfmindb %%v16,%%v16,%%v17,0 \n\t" - "vfmindb %%v0,%%v0,%%16,0 \n\t" + "vfmindb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" 
"brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index c8d831d063..b629d64c06 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -81,7 +81,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" - "vfmaxsb %%v0,%%v0,%%16,8 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index dd24c74d75..7ce6ee657c 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -81,7 +81,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) "vfminsb %%v16,%%v16,%%v17,8 \n\t" - "vfminsb %%v0,%%v0,%%16,8 \n\t" + "vfminsb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index 8a2b86dc17..e492d739c3 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -74,7 +74,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) "vfmaxsb %%v16,%%v16,%%v17,0 \n\t" - "vfmaxsb %%v0,%%v0,%%16,0 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index b87ec0fe81..e7d83441b3 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -74,7 +74,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) "vfminsb %%v16,%%v16,%%v17,0 \n\t" - "vfminsb %%v0,%%v0,%%16,0 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" From 877023e1e194faf5e42e2bb2d0771b52b52fed94 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:22:26 +0100 Subject: [PATCH 080/189] Fix precision of zarch DSDOT from patch provided by aarnez in #991 --- kernel/zarch/dsdot.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 800bb0d51a..72950c9f44 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -132,7 +132,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) while(i < n) { - dot += y[i] * x[i] ; + dot += (double) y[i] * (double) x[i] ; i++ ; } @@ -146,7 +146,8 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) while(i < n1) { - dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + dot += (double) y[iy] * (double) x[ix]; + dot += (double) y[iy+inc_y] * (double) x[ix+inc_x]; ix += inc_x*2 ; iy += inc_y*2 ; i+=2 ; @@ -156,7 +157,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) while(i < n) { - dot += y[iy] * x[ix] ; + dot += (double) y[iy] * (double) x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; From cce574c3e0763af7a5017f20fa36959c896fc4fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:24:55 +0100 Subject: [PATCH 081/189] Improve the z14 SGEMVT kernel from patch provided by aarnez in #991 --- sgemv_t_4.c | 811 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 811 insertions(+) create mode 100644 sgemv_t_4.c diff --git a/sgemv_t_4.c b/sgemv_t_4.c new file mode 100644 index 0000000000..a3136723ae --- /dev/null +++ b/sgemv_t_4.c @@ -0,0 +1,811 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmasb 
%%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "veslg %%v4,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v4 \n\t" + "vrepg %%v4,%%v0,1 \n\t" + "aebr %%f0,%%f4 \n\t" + "ste %%f0,0(%6) \n\t" + "veslg %%v4,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v4 \n\t" + "vrepg %%v4,%%v1,1 \n\t" + "aebr %%f1,%%f4 \n\t" + "ste %%f1,4(%6) \n\t" + "veslg %%v4,%%v2,32 \n\t" + "vfasb %%v2,%%v2,%%v4 \n\t" + "vrepg %%v4,%%v2,1 \n\t" + "aebr %%f2,%%f4 \n\t" + "ste %%f2,8(%6) \n\t" + "veslg %%v4,%%v3,32 \n\t" + "vfasb %%v3,%%v3,%%v4 \n\t" + "vrepg %%v4,%%v3,1 \n\t" + "aebr %%f3,%%f4 \n\t" + "ste %%f3,12(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmasb 
%%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "veslg %%v2,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vrepg %%v2,%%v0,1 \n\t" + "aebr %%f0,%%f2 \n\t" + "ste %%f0,0(%4) \n\t" + "veslg %%v2,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v2 \n\t" + "vrepg %%v2,%%v1,1 \n\t" + "aebr %%f1,%%f2 \n\t" + "ste %%f1,4(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "aebr %%f0,%%f1 \n\t" + "ste %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + dest[i] = *src; + src += inc_src; + } +} + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl 
%%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + "vl %%v25, 16(%%r1,%3) \n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" + "vst %%v25, 16(%%r1,%3) \n\t" + "vl %%v26, 32(%%r1,%3) \n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" + "vst %%v26, 32(%%r1,%3) \n\t" + "vl %%v27, 48(%%r1,%3) \n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" + "vst %%v27, 48(%%r1,%3) \n\t" + "vl %%v28, 64(%%r1,%3) \n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" + "vst %%v28, 64(%%r1,%3) \n\t" + "vl %%v29, 80(%%r1,%3) \n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" + "vst %%v29, 80(%%r1,%3) \n\t" + "vl %%v30, 96(%%r1,%3) \n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" + "vst %%v30, 96(%%r1,%3) \n\t" + "vl %%v31, 112(%%r1,%3) \n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" + "vst %%v31, 112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else + { + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i] * da; + dest += inc_dest; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? 
m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + // a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + // y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j Date: Fri, 1 Feb 2019 12:57:01 +0100 Subject: [PATCH 082/189] Delete misplaced file sgemv_t_4.c from #1993 , file should have gone into kernel/zarch --- sgemv_t_4.c | 811 ---------------------------------------------------- 1 file changed, 811 deletions(-) delete mode 100644 sgemv_t_4.c diff --git a/sgemv_t_4.c b/sgemv_t_4.c deleted file mode 100644 index a3136723ae..0000000000 --- a/sgemv_t_4.c +++ /dev/null @@ -1,811 +0,0 @@ -/*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#define NBMAX 2048 - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" - 
- "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v4,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v4 \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "ste %%f0,0(%6) \n\t" - "veslg %%v4,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v4 \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "ste %%f1,4(%6) \n\t" - "veslg %%v4,%%v2,32 \n\t" - "vfasb %%v2,%%v2,%%v4 \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "ste %%f2,8(%6) \n\t" - "veslg %%v4,%%v3,32 \n\t" - "vfasb %%v3,%%v3,%%v4 \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "ste %%f3,12(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - 
"veslg %%v2,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "ste %%f0,0(%4) \n\t" - "veslg %%v2,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v2 \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "ste %%f1,4(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "ste %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } -} - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) \n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - 
"vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} -static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; - } - - if ( n2 & 2 ) - { - - sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; - - } - - if ( n2 & 1 ) - { - - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; - - } - a += NB; - x += NB * inc_x; - } - - if ( m3 == 0 ) return(0); - - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j Date: Fri, 1 Feb 2019 12:58:59 +0100 Subject: [PATCH 083/189] Fix incorrect 
sgemv results for IBM z14 part of PR #1993 that was inadvertently misplaced into the toplevel directory --- kernel/zarch/sgemv_t_4.c | 60 +++++++++++++++------------------------- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 5515d7bb7d..a3136723ae 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -158,32 +158,24 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "vrepf %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "vrepf %%v4,%%v0,2 \n\t" - "aebr %%f0,%%f4 \n\t" - "vrepf %%v4,%%v0,3 \n\t" + "veslg %%v4,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v4 \n\t" + "vrepg %%v4,%%v0,1 \n\t" "aebr %%f0,%%f4 \n\t" "ste %%f0,0(%6) \n\t" - "vrepf %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "vrepf %%v4,%%v1,2 \n\t" - "aebr %%f1,%%f4 \n\t" - "vrepf %%v4,%%v1,3 \n\t" + "veslg %%v4,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v4 \n\t" + "vrepg %%v4,%%v1,1 \n\t" "aebr %%f1,%%f4 \n\t" "ste %%f1,4(%6) \n\t" - "vrepf %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "vrepf %%v4,%%v2,2 \n\t" - "aebr %%f2,%%f4 \n\t" - "vrepf %%v4,%%v2,3 \n\t" + "veslg %%v4,%%v2,32 \n\t" + "vfasb %%v2,%%v2,%%v4 \n\t" + "vrepg %%v4,%%v2,1 \n\t" "aebr %%f2,%%f4 \n\t" "ste %%f2,8(%6) \n\t" - "vrepf %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "vrepf %%v4,%%v3,2 \n\t" - "aebr %%f3,%%f4 \n\t" - "vrepf %%v4,%%v3,3 \n\t" + "veslg %%v4,%%v3,32 \n\t" + "vfasb %%v3,%%v3,%%v4 \n\t" + "vrepg %%v4,%%v3,1 \n\t" "aebr %%f3,%%f4 \n\t" "ste %%f3,12(%6) " : @@ -281,18 +273,14 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "vrepf %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "vrepf %%v2,%%v0,2 \n\t" - "aebr %%f0,%%f2 \n\t" - "vrepf %%v2,%%v0,3 \n\t" + "veslg %%v2,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vrepg %%v2,%%v0,1 \n\t" "aebr %%f0,%%f2 \n\t" "ste %%f0,0(%4) \n\t" - "vrepf %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "vrepf %%v2,%%v1,2 \n\t" - "aebr %%f1,%%f2 \n\t" - "vrepf %%v2,%%v1,3 \n\t" + "veslg %%v2,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v2 \n\t" + "vrepg %%v2,%%v1,1 \n\t" "aebr %%f1,%%f2 \n\t" "ste %%f1,4(%4) " : @@ -349,7 +337,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "vl %%v31,112(%%r1,%1) \n\t" "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - + "agfi %%r1,128 \n\t" "brctg %%r0,0b \n\t" @@ -370,11 +358,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "vrepf %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "vrepf %%v1,%%v0,3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepg %%v1,%%v0,1 \n\t" "aebr %%f0,%%f1 \n\t" "ste %%f0,0(%3) " : @@ -823,5 +809,3 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return(0); } - - From 4abc375a91d6a3bc97e180dca9f33750193ad281 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 1 Feb 2019 13:45:00 +0000 Subject: [PATCH 084/189] sgemv cgemv pairs --- kernel/power/KERNEL.POWER8 | 8 +- kernel/power/cgemv_n.c | 585 +++++++++++++++++++++++++++++++++++++ kernel/power/cgemv_t.c | 571 ++++++++++++++++++++++++++++++++++++ kernel/power/dgemv_t.c | 4 +- kernel/power/icamax.c | 81 ++++- kernel/power/sgemv_n.c | 465 +++++++++++++++++++++++++++++ kernel/power/sgemv_t.c | 480 ++++++++++++++++++++++++++++++ kernel/power/sgemv_t_8.c | 501 +++++++++++++++++++++++++++++++ kernel/power/zgemv_n_4.c | 22 +- kernel/power/zgemv_t_4.c | 6 +- utest/Makefile | 1 
- 11 files changed, 2691 insertions(+), 33 deletions(-) create mode 100644 kernel/power/cgemv_n.c create mode 100644 kernel/power/cgemv_t.c create mode 100644 kernel/power/sgemv_n.c create mode 100644 kernel/power/sgemv_t.c create mode 100644 kernel/power/sgemv_t_8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index cbcffb8fe8..e6f69c7c47 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c ZSWAPKERNEL = zswap.c # -#SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = sgemv_n.c DGEMVNKERNEL = dgemv_n.c -#CGEMVNKERNEL = ../arm/zgemv_n.c +CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # -#SGEMVTKERNEL = ../arm/gemv_t.c +SGEMVTKERNEL = sgemv_t.c DGEMVTKERNEL = dgemv_t.c -#CGEMVTKERNEL = ../arm/zgemv_t.c +CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c new file mode 100644 index 0000000000..cb01e196e4 --- /dev/null +++ b/kernel/power/cgemv_n.c @@ -0,0 +1,585 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "common.h"
+#include <altivec.h>
+#define NBMAX 1024
+
+
+static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+
+
+static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
+
+    FLOAT *a0, *a1, *a2, *a3;
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+    register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
+    register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]};
+    register __vector float vx1_r = {x[2], x[2],x[2], x[2]};
+    register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]};
+    register __vector float vx2_r = {x[4], x[4],x[4], x[4]};
+    register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]};
+    register __vector float vx3_r = {x[6], x[6],x[6], x[6]};
+    register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]};
+#else
+    register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
+    register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
+    register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
+    register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
+    register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]};
+    register __vector float vx2_i = {x[5], x[5],x[5], x[5]};
+    register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]};
+    register __vector float vx3_i = {x[7], x[7],x[7], x[7]};
+#endif
+    register __vector float *vy = (__vector float *) y;
+    register __vector float *vptr_a0 = (__vector float *) a0;
+    register __vector float *vptr_a1 = (__vector float *) a1;
+    register __vector float *vptr_a2 = (__vector float *) a2;
+    register __vector float *vptr_a3 = (__vector float *) a3;
+    BLASLONG i = 0;
+    for (;i< n / 2; i+=2) {
+        register __vector float vy_0 = vy[i];
+        register __vector float vy_1 = vy[i + 1];
+        register __vector float va0 = vptr_a0[i];
+        register __vector float va1 = vptr_a1[i];
+        register __vector float va2 = vptr_a2[i];
+        register __vector float va3 = vptr_a3[i];
+        register __vector float va0_1 = vptr_a0[i + 1];
+        register __vector float va1_1 = vptr_a1[i + 1];
+        register __vector float va2_1 = vptr_a2[i + 1];
+        register __vector float va3_1 = vptr_a3[i + 1];
+
+        vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r;
+        vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r;
+        va0 = vec_perm(va0, va0,swap_mask);
+        va0_1 = vec_perm(va0_1, va0_1,swap_mask);
+        va1 = vec_perm(va1, va1,swap_mask);
+        va1_1 = vec_perm(va1_1, va1_1,swap_mask);
+        va2 = vec_perm(va2, va2,swap_mask);
+        va2_1 = vec_perm(va2_1, va2_1,swap_mask);
+        va3 = vec_perm(va3, va3,swap_mask);
+        va3_1 = vec_perm(va3_1, va3_1,swap_mask);
+        vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i;
+        vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i;
+
+        vy[i] = vy_0;
+        vy[i + 1] = vy_1;
+    }
+
+}
+
+
+
+static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
+
+    FLOAT *a0, *a1;
+    a0 = ap;
+    a1 = ap + lda;
+    __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+    register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
+    register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]};
+    register __vector float vx1_r = {x[2], x[2],x[2], x[2]};
+    register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]};
+#else
+    register
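As a plain-C cross-check of what one column update in cgemv_kernel_4x4 computes for the non-conjugated branch: splatting x into {xr,xr,...} and {-xi,xi,...} and permuting a with swap_mask amounts to an ordinary complex axpy. A sketch (caxpy_ref is an illustrative name, using plain C types rather than FLOAT/BLASLONG):

static void caxpy_ref(long n, const float *a, const float *x, float *y) {
    float xr = x[0], xi = x[1];
    for (long i = 0; i < n; i++) {
        float ar = a[2 * i], ai = a[2 * i + 1];
        y[2 * i]     += ar * xr - ai * xi;   /* vx0_r lanes plus the -xi lane */
        y[2 * i + 1] += ai * xr + ar * xi;   /* swap_mask puts ar in this lane */
    }
}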
__vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; +#endif + register __vector float *vy = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + BLASLONG i = 0; + for (;i< n / 2; i+=2) { + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float va0 = vptr_a0[i]; + register __vector float va1 = vptr_a1[i]; + register __vector float va0_1 = vptr_a0[i + 1]; + register __vector float va1_1 = vptr_a1[i + 1]; + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + register __vector float va1x = vec_perm(va1, va1,swap_mask); + register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); + vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + } + +} + + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; +#endif + register __vector float *vy = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) ap; + BLASLONG i = 0; + for (;i< n / 2; i+=2) { + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float va0 = vptr_a0[i]; + register __vector float va0_1 = vptr_a0[i + 1]; + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + } +} + + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + + if (inc_dest != 2) { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i +static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector 
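For clarity on the byte mask used throughout these kernels: swap_mask_arr = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11} makes vec_perm select float lanes [1,0,3,2], i.e. it swaps real and imaginary within each complex pair. A scalar sketch (swap_pairs_ref is a hypothetical name):

static void swap_pairs_ref(const float in[4], float out[4]) {
    out[0] = in[1];   /* bytes 4..7   -> lane 0 */
    out[1] = in[0];   /* bytes 0..3   -> lane 1 */
    out[2] = in[3];   /* bytes 12..15 -> lane 2 */
    out[3] = in[2];   /* bytes 8..11  -> lane 3 */
}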
float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + + for (i = 0; i < n / 2; i+=2) { + register __vector float vx_0 = v_x[i]; + register __vector float vx_1 = v_x[i+1]; + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; + vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; + vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; + vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1]; + vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1]; + vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1]; + vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1]; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif + +} + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register 
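The alternating signs in the reductions above can be checked against a scalar model: the vtemp*_p lanes hold {xr*ar, xi*ai, ...} and the vtemp*_r lanes hold {xi*ar, xr*ai, ...}, so p0 - p1 + p2 - p3 is the real part of the unconjugated dot product and r0 + r1 + r2 + r3 its imaginary part. A sketch of that default branch (cdotu_ref is an illustrative name):

static void cdotu_ref(long n, const float *a, const float *x,
                      float *re, float *im) {
    float r = 0.0f, i = 0.0f;
    for (long k = 0; k < n; k++) {
        r += x[2 * k] * a[2 * k] - x[2 * k + 1] * a[2 * k + 1]; /* p0 - p1 */
        i += x[2 * k + 1] * a[2 * k] + x[2 * k] * a[2 * k + 1]; /* r0 + r1 */
    }
    *re = r;
    *im = i;
}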
__vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + + for (i = 0; i < n / 2; i+=2) { + register __vector float vx_0 = v_x[i]; + register __vector float vx_1 = v_x[i+1]; + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; + vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; + vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif + +} + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + __vector float* va0 = (__vector float*) ap; + __vector float* v_x = (__vector float*) x; + + for (i = 0; i < n / 2; i+=2) { + register __vector float vx_0 = v_x[i]; + register __vector float vx_1 = v_x[i+1]; + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; + vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 
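The copy_x helper defined just below packs a strided complex x into a contiguous scratch buffer so the kernels can use plain vector loads; note the driver doubles inc_x first, so inc_src arrives in FLOAT units. A standalone equivalent (copy_x_ref is an illustrative name):

static void copy_x_ref(long n, const float *src, float *dest, long inc_src) {
    for (long i = 0; i < n; i++) {
        dest[0] = src[0];    /* real */
        dest[1] = src[1];    /* imag */
        dest += 2;
        src += inc_src;      /* inc_src = 2 * inc_x, set by the caller */
    }
}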
1); + dest += 2; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
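The m1/m2/m3 arithmetic above splits the m rows into NBMAX-row panels plus a sub-4 scalar tail. A self-contained sketch of the panel sizes the while-loop produces (values in the comments assume m = 2501):

#include <stdio.h>

#define NBMAX 1024

int main(void) {
    long m = 2501;
    long m3 = m & 3;                    /* 1: rows handled scalar at the end   */
    long m1 = m - m3;                   /* 2500                                */
    long m2 = (m & (NBMAX - 1)) - m3;   /* 452: size of the last partial panel */
    long NB = NBMAX;
    while (NB == NBMAX) {
        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }
        printf("panel of %ld rows\n", NB);   /* 1024, 1024, 452 */
    }
    return 0;
}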
defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] 
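Each leftover row in these m3 branches is a short complex dot product followed by a complex scale-and-add with alpha; for the default (!CONJ, !XCONJ) case the per-row work reduces to the following sketch (tail_row is an illustrative name). The other preprocessor branches only flip the signs of the cross terms.

static void tail_row(const float *a, const float *x, float *y,
                     float alpha_r, float alpha_i) {
    float t_r = a[0] * x[0] - a[1] * x[1];   /* Re(a * x)      */
    float t_i = a[0] * x[1] + a[1] * x[0];   /* Im(a * x)      */
    y[0] += alpha_r * t_r - alpha_i * t_i;   /* y += alpha * t */
    y[1] += alpha_r * t_i + alpha_i * t_r;
}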
+= alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} + diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index 3974ed62dd..b8589a1311 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define NBMAX 8192 -#define PREFETCH 1 +#define NBMAX 1024 +//#define PREFETCH 1 #include #define HAVE_KERNEL4x8_ASM 1 diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index aa0531dc61..06fc5d8ad7 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -36,9 +36,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code +#if !defined(USE_MASK_PERMUTATIONS) + +static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgew %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgow %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +#endif - /** * Find maximum index * Warning: requirements n>0 and n % 32 == 0 @@ -51,12 +76,16 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; BLASLONG i; +#if defined(USE_MASK_PERMUTATIONS) register __vector unsigned int static_index0 = {0,1,2,3}; +#else + register __vector unsigned int static_index0 = {2,0,3,1}; +#endif register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + register __vector unsigned int static_index1=static_index0 +temp0; + register __vector unsigned int static_index2=static_index0 +temp1; + register __vector unsigned int static_index3=static_index1 +temp1; temp0=vec_xor(temp0,temp0); temp1=temp1 <<1 ; //{16,16,16,16} register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} @@ -64,9 +93,11 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { register __vector float quadruple_values={0,0,0,0}; register __vector float * v_ptrx=(__vector float *)x; +#if defined(USE_MASK_PERMUTATIONS) register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - for(; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + 
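The mvec_mergee/mvec_mergeo helpers in the icamax.c hunk wrap vmrgew/vmrgow, which interleave the even and odd word lanes of two vectors; on interleaved complex data that separates real and imaginary parts before the |re| + |im| comparison. A scalar sketch of that behavior, assuming the usual lane numbering (merge_even_odd is a hypothetical name):

static void merge_even_odd(const float a[4], const float b[4],
                           float even[4], float odd[4]) {
    even[0] = a[0]; even[1] = b[0];   /* vmrgew: lanes 0 and 2 of each input */
    even[2] = a[2]; even[3] = b[2];
    odd[0]  = a[1]; odd[1]  = b[1];   /* vmrgow: lanes 1 and 3 of each input */
    odd[2]  = a[3]; odd[3]  = b[3];
}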
memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + 
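The column loop above covers any n with one bit test per tail size: n1 blocks of 8 columns, then at most one 4-, one 2- and one 1-column kernel call. A sketch of the dispatch arithmetic:

#include <stdio.h>

int main(void) {
    long n = 23;
    long n1 = n >> 3;   /* 2 full 8-column blocks */
    long n2 = n & 7;    /* remainder 7            */
    printf("4x8 calls: %ld\n", n1);
    if (n2 & 4) printf("one 4x4 call\n");
    if (n2 & 2) printf("one 4x2 call\n");
    if (n2 & 1) printf("one 4x1 call\n");
    return 0;
}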
temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c new file mode 100644 index 0000000000..96434a13f6 --- /dev/null +++ b/kernel/power/sgemv_t.c @@ -0,0 +1,480 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/
+
+#include "common.h"
+
+#define NBMAX 2048
+
+#include <altivec.h>
+static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
+    BLASLONG i;
+    FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
+    __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
+    register __vector float temp0 = {0,0,0,0};
+    register __vector float temp1 = {0,0,0,0};
+    register __vector float temp2 = {0,0,0,0};
+    register __vector float temp3 = {0,0,0,0};
+    register __vector float temp4 = {0,0,0,0};
+    register __vector float temp5 = {0,0,0,0};
+    register __vector float temp6 = {0,0,0,0};
+    register __vector float temp7 = {0,0,0,0};
+
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    a4 = a3 + lda;
+    a5 = a4 + lda;
+    a6 = a5 + lda;
+    a7 = a6 + lda;
+    va0 = (__vector float*) a0;
+    va1 = (__vector float*) a1;
+    va2 = (__vector float*) a2;
+    va3 = (__vector float*) a3;
+    va4 = (__vector float*) a4;
+    va5 = (__vector float*) a5;
+    va6 = (__vector float*) a6;
+    va7 = (__vector float*) a7;
+    v_x = (__vector float*) x;
+
+
+    for (i = 0; i < n/4; i ++) {
+        temp0 += v_x[i] * va0[i];
+        temp1 += v_x[i] * va1[i];
+        temp2 += v_x[i] * va2[i];
+        temp3 += v_x[i] * va3[i];
+        temp4 += v_x[i] * va4[i];
+        temp5 += v_x[i] * va5[i];
+        temp6 += v_x[i] * va6[i];
+        temp7 += v_x[i] * va7[i];
+    }
+
+
+    y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
+    y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
+    y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
+    y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
+
+    y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
+    y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
+    y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
+    y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
+
+}
+
+
+static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
+    BLASLONG i = 0;
+    FLOAT *a0, *a1, *a2, *a3;
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    __vector float* va0 = (__vector float*) a0;
+    __vector float* va1 = (__vector float*) a1;
+    __vector float* va2 = (__vector float*) a2;
+    __vector float* va3 = (__vector float*) a3;
+    __vector float* v_x = (__vector float*) x;
+    register __vector float temp0 = {0,0,0,0};
+    register __vector float temp1 = {0,0,0,0};
+    register __vector float temp2 = {0,0,0,0};
+    register __vector float temp3 = {0,0,0,0};
+
+    for (i = 0; i < n / 4; i ++) {
+        temp0 += v_x[i] * va0[i];
+        temp1 += v_x[i] * va1[i];
+        temp2 += v_x[i] * va2[i];
+        temp3 += v_x[i] * va3[i];
+    }
+
+    y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
+    y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
+    y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
+    y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
+
+}
+
+
+static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
+
+    BLASLONG i;
+    FLOAT *a0, *a1;
+    a0 = ap;
+    a1 = ap + lda;
+    __vector float* va0 = (__vector float*) a0;
+    __vector float* va1 = (__vector float*) a1;
+    __vector float* v_x = (__vector float*) x;
+    __vector float temp0 = {0,0,0,0};
+    __vector float temp1 = {0,0,0,0};
+    for (i = 0; i < n / 4; i ++) {
+        temp0 += v_x[i] * va0[i];
+        temp1 += v_x[i] * va1[i];
+    }
+
+
+
+    y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
+    y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
+}
+
+static
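The transposed kernels defined here are straight vector dot products: four partial sums per column, collapsed once at the end. The same pattern in a self-contained sketch, assuming GCC vector extensions instead of AltiVec types (dot_v4 is an illustrative name):

typedef float v4f __attribute__((vector_size(16)));

static float dot_v4(long n4, const v4f *a, const v4f *x, float alpha) {
    v4f t = {0.0f, 0.0f, 0.0f, 0.0f};
    for (long i = 0; i < n4; i++)
        t += x[i] * a[i];                        /* 4 lanes of products */
    return alpha * (t[0] + t[1] + t[2] + t[3]);  /* one horizontal sum  */
}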
void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i] ; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda 
<< 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c new file mode 100644 index 0000000000..c9f9282586 --- /dev/null +++ b/kernel/power/sgemv_t_8.c @@ -0,0 +1,501 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "common.h" +#include +#define NBMAX 2048 + +#include + +static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i +=2) { + register __vector float vx1=v_x[i] ; + register __vector float vx2=v_x[i+1] ; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float va4_1=va4[i] ; + register __vector float va4_2=va4[i+1] ; + register __vector float va5_1=va5[i] ; + register __vector float va5_2=va5[i+1] ; + register __vector float va6_1=va6[i] ; + register __vector float va6_2=va6[i+1] ; + register __vector float va7_1=va7[i] ; + register __vector float va7_2=va7[i+1] ; + temp0 += vx1* va0_1 + vx2 * va0_2; + temp1 += vx1* va1_1 + vx2 * va1_2; + temp2 += vx1* va2_1 + vx2 * va2_2; + temp3 += vx1* va3_1 + vx2 * va3_2; + temp4 += vx1* va4_1 + vx2 * va4_2; + temp5 += vx1* va5_1 + vx2 * va5_2; + temp6 += vx1* va6_1 + vx2 * va6_2; + temp7 += vx1* va7_1 + vx2 * va7_2; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + 
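The i += 2 step in these 8x kernels loads two vectors per column before accumulating, giving the FMA pipeline independent work to overlap. The scalar analogue, with the two chains made explicit (dot_unrolled is an illustrative name):

static float dot_unrolled(long n, const float *a, const float *x) {
    float s0 = 0.0f, s1 = 0.0f;   /* two independent accumulator chains */
    long i;
    for (i = 0; i + 1 < n; i += 2) {
        s0 += a[i] * x[i];
        s1 += a[i + 1] * x[i + 1];
    }
    if (i < n)
        s0 += a[i] * x[i];
    return s0 + s1;
}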
temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + } + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 7; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + 
sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 & 4) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp3 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 4 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; + y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; + y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; + aj += 16; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + aj += 4; + } + + } else if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; + y_ptr += inc_y; + aj += lda; + } + + } + if (m3==4) return (0); + a_ptr += 4; + } + + if (m3 & 2 ) { + + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; 
+ y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + if (m3==2) return (0); + a_ptr += 2; + } + if (m3 & 1) { + + FLOAT xtemp = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + + } + a_ptr += 1; + } + return (0); + +} + diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 8b250a7f1e..167b0a1586 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -389,20 +389,14 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { register __vector double va0_2 = vptr_a0[i + 2]; register __vector double va0_3 = vptr_a0[i + 3]; - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_xxpermdi(va0, va0, 2); - va0_1 = vec_xxpermdi(va0_1, va0_1, 2); - va0_2 = vec_xxpermdi(va0_2, va0_2, 2); - va0_3 = vec_xxpermdi(va0_3, va0_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; + register __vector double va0x = vec_xxpermdi(va0, va0, 2); + register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2); + register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2); + register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + vy_2 += va0_2*vx0_r + va0x_2*vx0_i; + vy_3 += va0_3*vx0_r + va0x_3*vx0_i; vy[i] = vy_0; vy[i + 1] = vy_1; diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 5722064946..20a0812dd2 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -59,11 +59,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA i = 0; n = n << 1; while (i < n) { -// __builtin_prefetch(&x[i]); -// __builtin_prefetch(&a0[i]); -// __builtin_prefetch(&a1[i]); -// __builtin_prefetch(&a2[i]); -// __builtin_prefetch(&a3[i]); + register __vector double vx_0 = *(__vector double*) (&x[i]); register 
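Unlike sgemv_t.c, which branches on m3 == 1/2/3, this 8x variant peels the row tail bit by bit (m3 & 4, then & 2, then & 1), advancing a_ptr after each stage so every leftover count from 0 to 7 is covered. A sketch of the dispatch:

#include <stdio.h>

int main(void) {
    long m = 15;
    long m3 = m & 7;   /* 7 leftover rows */
    if (m3 & 4) printf("4-row stage\n");
    if (m3 & 2) printf("2-row stage\n");
    if (m3 & 1) printf("1-row stage\n");
    return 0;
}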
__vector double vx_1 = *(__vector double*) (&x[i + 2]); register __vector double vx_2 = *(__vector double*) (&x[i + 4]); diff --git a/utest/Makefile b/utest/Makefile index e40b3c6db5..550a655691 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -37,4 +37,3 @@ clean: -rm -f *.o $(UTESTBIN) libs: - From cd9ea45463b46d603ae4f8c8af033331c07abbc2 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 4 Feb 2019 06:57:11 +0000 Subject: [PATCH 085/189] NBMAX=4096 for gemvn, added sgemvn 8x8 for future --- kernel/power/sgemv_n.c | 2 +- kernel/power/sgemv_n_8.c | 507 +++++++++++++++++++++++++++++++++++++++ kernel/power/sgemv_t_8.c | 2 +- 3 files changed, 509 insertions(+), 2 deletions(-) create mode 100644 kernel/power/sgemv_n_8.c diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 56f08c2bf0..9704757fe4 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define NBMAX 2048 +#define NBMAX 4096 static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c new file mode 100644 index 0000000000..d05b08f4ea --- /dev/null +++ b/kernel/power/sgemv_n_8.c @@ -0,0 +1,507 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + register __vector float v_x0 = {x0,x0,x0,x0}; + register __vector float v_x1 = {x1,x1,x1,x1}; + register __vector float v_x2 = {x2,x2,x2,x2}; + register __vector float v_x3 = {x3,x3,x3,x3}; + register __vector float v_x4 = {x4,x4,x4,x4}; + register __vector float v_x5 = {x5,x5,x5,x5}; + register __vector float v_x6 = {x6,x6,x6,x6}; + register __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i+=2) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float vb0_1=vb0[i] ; + register __vector float vb0_2=vb0[i+1] ; + register __vector float vb1_1=vb1[i] ; + register __vector float vb1_2=vb1[i+1] ; + register __vector float vb2_1=vb2[i] ; + register __vector float vb2_2=vb2[i+1] ; + register __vector float vb3_1=vb3[i] ; + register __vector float vb3_2=vb3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i+=2 ) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float 
va3_1=va3[i] ;
+        register __vector float va3_2=va3[i+1] ;
+        vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ;
+        vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ;
+        v_y[i] = vy_1;
+        v_y[i+1] = vy_2;
+    }
+
+}
+
+static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+    BLASLONG i;
+    FLOAT x0,x1;
+    x0 = x[0] * *alpha;
+    x1 = x[1] * *alpha;
+    __vector float v_x0 = {x0,x0,x0,x0};
+    __vector float v_x1 = {x1,x1,x1,x1};
+    __vector float* v_y = (__vector float*)y;
+    __vector float* va0 = (__vector float*)ap[0];
+    __vector float* va1 = (__vector float*)ap[1];
+
+    for ( i=0; i< n/4; i+=2 )
+    {
+        v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
+        v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ;
+    }
+
+}
+
+
+static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+    BLASLONG i;
+    FLOAT x0 ;
+    x0 = x[0] * *alpha;
+    __vector float v_x0 = {x0,x0,x0,x0};
+    __vector float* v_y = (__vector float*)y;
+    __vector float* va0 = (__vector float*)ap;
+
+    for ( i=0; i< n/4; i+=2 )
+    {
+        v_y[i] += v_x0 * va0[i] ;
+        v_y[i+1] += v_x0 * va0[i+1] ;
+    }
+
+}
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+    BLASLONG i;
+
+    for ( i=0; i<n; i++ )
+    {
+        *dest += src[i];
+        dest += inc_dest;
+    }
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i;
+    FLOAT *a_ptr;
+    FLOAT *x_ptr;
+    FLOAT *y_ptr;
+    FLOAT *ap[4];
+    BLASLONG n1;
+    BLASLONG m1;
+    BLASLONG m2;
+    BLASLONG m3;
+    BLASLONG n2;
+    BLASLONG lda4 = lda << 2;
+    BLASLONG lda8 = lda << 3;
+    FLOAT xbuffer[8], *ybuffer;
+
+    if ( m < 1 ) return(0);
+    if ( n < 1 ) return(0);
+
+    ybuffer = buffer;
+
+    if ( inc_x == 1 )
+    {
+        n1 = n >> 3 ;
+        n2 = n & 7 ;
+    }
+    else
+    {
+        n1 = n >> 2 ;
+        n2 = n & 3 ;
+
+    }
+
+    m3 = m & 7 ;
+    m1 = m - m3;
+    m2 = (m & (NBMAX-1)) - m3 ;
+
+
+    y_ptr = y;
+
+    BLASLONG NB = NBMAX;
+
+    while ( NB == NBMAX )
+    {
+
+        m1 -= NB;
+        if ( m1 < 0)
+        {
+            if ( m2 == 0 ) break;
+            NB = m2;
+        }
+
+        a_ptr = a;
+        x_ptr = x;
+
+        ap[0] = a_ptr;
+        ap[1] = a_ptr + lda;
+        ap[2] = ap[1] + lda;
+        ap[3] = ap[2] + lda;
+
+        if ( inc_y != 1 )
+            memset(ybuffer,0,NB*4);
+        else
+            ybuffer = y_ptr;
+
+        if ( inc_x == 1 )
+        {
+
+
+            for( i = 0; i < n1 ; i++)
+            {
+                sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
+                ap[0] += lda8;
+                ap[1] += lda8;
+                ap[2] += lda8;
+                ap[3] += lda8;
+                a_ptr += lda8;
+                x_ptr += 8;
+            }
+
+
+            if ( n2 & 4 )
+            {
+                sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha);
+                ap[0] += lda4;
+                ap[1] += lda4;
+                ap[2] += lda4;
+                ap[3] += lda4;
+                a_ptr += lda4;
+                x_ptr += 4;
+            }
+
+            if ( n2 & 2 )
+            {
+                sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha);
+                a_ptr += lda*2;
+                x_ptr += 2;
+            }
+
+
+            if ( n2 & 1 )
+            {
+                sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
+                a_ptr += lda;
+                x_ptr += 1;
+            }
+
+
+        }
+        else
+        {
+
+            for( i = 0; i < n1 ; i++)
+            {
+                xbuffer[0] = x_ptr[0];
+                x_ptr += inc_x;
+                xbuffer[1] = x_ptr[0];
+                x_ptr += inc_x;
+                xbuffer[2] = x_ptr[0];
+                x_ptr += inc_x;
+                xbuffer[3] = x_ptr[0];
+                x_ptr += inc_x;
+                sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha);
+                ap[0] += lda4;
+                ap[1] += lda4;
+                ap[2] += lda4;
+                ap[3] += lda4;
+                a_ptr += lda4;
+            }
+
+            for( i = 0; i < n2 ; i++)
+            {
+                xbuffer[0] = x_ptr[0];
+                x_ptr += inc_x;
+                sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
+                a_ptr += lda;
+
+            }
+
+        }
+
+        a += NB;
+        if ( inc_y != 1 )
+        {
+            add_y(NB,ybuffer,y_ptr,inc_y);
+            y_ptr += NB * inc_y;
+        }
+        else
+            y_ptr += NB ;
+
+    }
+
+
+    if ( m3 & 4 )
+    {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp0 = 0.0;
+        FLOAT temp1 = 0.0;
+        FLOAT temp2 = 0.0;
+        FLOAT temp3 = 0.0;
+        if ( lda == 4 && inc_x ==1 )
+        {
+
+            for( i = 0; i < ( n & -4 ); i+=4 )
+            {
+
+                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1];
+                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1];
+                temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1];
+
+                temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3];
+                temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3];
+                temp2 += a_ptr[10] * x_ptr[2] + 
a_ptr[14] * x_ptr[3]; + temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; + + a_ptr += 16; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0] ; + a_ptr +=4; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + y_ptr += inc_y; + y_ptr[0] += alpha * temp3; + y_ptr += inc_y; + a += 4; + } + + + if ( m3 & 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + a += 2; + } + + if ( m3 & 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + + + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index c9f9282586..e426f36c3f 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#define NBMAX 2048 +#define NBMAX 4096 #include From 498ac98581accf80085c020874ad6a9513f95996 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 4 Feb 2019 15:41:56 +0000 Subject: [PATCH 086/189] Note for unused kernels --- kernel/power/sgemv_n_8.c | 6 ++++++ kernel/power/sgemv_t_8.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c index d05b08f4ea..9bc93ced67 100644 --- a/kernel/power/sgemv_n_8.c +++ b/kernel/power/sgemv_n_8.c @@ -26,6 +26,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/****Note*** +UnUsed kernel +This kernel works. 
But it was not competitive enough to be added to production.
+It could be used and tested in the future, or could provide a bare-bones base for switching to inline assembly.
+*/
+
 #include "common.h"
 
 #define NBMAX 4096
diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c
index e426f36c3f..5e9cd63ac3 100644
--- a/kernel/power/sgemv_t_8.c
+++ b/kernel/power/sgemv_t_8.c
@@ -25,6 +25,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
+
+/****Note***
+Unused kernel
+This kernel works, but it was not competitive enough to be added to production.
+It could be used and tested in the future, or could be used as a base for switching to inline assembly.
+*/
+
 #include "common.h"
 #include <altivec.h>
 #define NBMAX 4096

From 81daf6bc380c22bcc7ce228952e5435bc79bb0ce Mon Sep 17 00:00:00 2001
From: maamountki
Date: Tue, 5 Feb 2019 07:30:38 +0200
Subject: [PATCH 087/189] [ZARCH] Format source code, Fix constraints

---
 kernel/zarch/camax.c     |  370 +++++-----
 kernel/zarch/camin.c     |  370 +++++-----
 kernel/zarch/casum.c     |  236 +++----
 kernel/zarch/caxpy.c     |  232 +++----
 kernel/zarch/ccopy.c     |  102 ++-
 kernel/zarch/cdot.c      |  254 ++++---
 kernel/zarch/cgemv_n_4.c | 1263 +++++++++++++++++-----------------
 kernel/zarch/cgemv_t_4.c | 1179 ++++++++++++++++----------------
 kernel/zarch/crot.c      |  413 ++++++-----
 kernel/zarch/cscal.c     |  684 +++++++++----------
 kernel/zarch/cswap.c     |  263 ++++---
 kernel/zarch/damax.c     |  220 +++---
 kernel/zarch/damax_z13.c |  292 ++++----
 kernel/zarch/damin.c     |  220 +++---
 kernel/zarch/damin_z13.c |  292 ++++----
 kernel/zarch/dasum.c     |  248 ++++---
 kernel/zarch/daxpy.c     |  253 ++++---
 kernel/zarch/dcopy.c     |   76 +--
 kernel/zarch/ddot.c      |  196 +++---
 kernel/zarch/dgemv_n_4.c | 1200 +++++++++++++++----------------
 kernel/zarch/dgemv_t_4.c | 1397 ++++++++++++++++++--------------------
 kernel/zarch/dmax.c      |  214 +++---
 kernel/zarch/dmax_z13.c  |  252 ++++---
 kernel/zarch/dmin.c      |  214 +++---
 kernel/zarch/dmin_z13.c  |  252 ++++---
 kernel/zarch/drot.c      |  381 +++++------
 kernel/zarch/dscal.c     |  278 ++++----
 kernel/zarch/dsdot.c     |  246 +++----
 kernel/zarch/dswap.c     |  228 +++----
 kernel/zarch/icamax.c    |  515 +++++++-------
 kernel/zarch/icamin.c    |  515 +++++++-------
 kernel/zarch/idamax.c    |  411 ++++++-----
 kernel/zarch/idamin.c    |  411 ++++++-----
 kernel/zarch/idmax.c     |  385 +++++------
 kernel/zarch/idmin.c     |  385 +++++------
 kernel/zarch/isamax.c    |  496 +++++++-------
 kernel/zarch/isamin.c    |  496 +++++++-------
 kernel/zarch/ismax.c     |  458 ++++++-------
 kernel/zarch/ismin.c     |  458 ++++++-------
 kernel/zarch/izamax.c    |  409 ++++++-----
 kernel/zarch/izamin.c    |  409 ++++++-----
 kernel/zarch/samax.c     |  225 +++---
 kernel/zarch/samin.c     |  225 +++---
 kernel/zarch/sasum.c     |  252 ++++---
 kernel/zarch/saxpy.c     |  253 ++++---
 kernel/zarch/scopy.c     |   76 +--
 kernel/zarch/sdot.c      |  188 ++---
 kernel/zarch/sgemv_n_4.c | 1157 +++++++++++++++----------------
 kernel/zarch/sgemv_t_4.c | 1380 ++++++++++++++++++------------------
 kernel/zarch/smax.c      |  219 +++---
 kernel/zarch/smin.c      |  219 +++---
 kernel/zarch/srot.c      |  381 +++++------
 kernel/zarch/sscal.c     |  268 ++++----
 kernel/zarch/sswap.c     |  230 +++----
 kernel/zarch/zamax.c     |  333 +++++----
 kernel/zarch/zamax_z13.c |  352 +++++-----
 kernel/zarch/zamin.c     |  317 ++++-----
 kernel/zarch/zamin_z13.c |  336 +++++----
 kernel/zarch/zasum.c     |  232 +++----
 kernel/zarch/zaxpy.c     |  232 +++----
 kernel/zarch/zcopy.c     |  102 ++-
 kernel/zarch/zdot.c      |  246 ++++---
 kernel/zarch/zgemv_n_4.c 
| 1147 +++++++++++++++---------------- kernel/zarch/zgemv_t_4.c | 1099 +++++++++++++++--------------- kernel/zarch/zrot.c | 413 ++++++----- kernel/zarch/zscal.c | 676 +++++++++--------- kernel/zarch/zswap.c | 263 ++++--- 67 files changed, 13393 insertions(+), 14601 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 2e9648640a..40a9903e94 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vlef %%v0,0(%2),0 \n\t" - "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),1 \n\t" - "vlef %%v16,12(%2),1 \n\t" - "vlef %%v0,16(%2),2 \n\t" - "vlef %%v16,20(%2),2 \n\t" - "vlef %%v0,24(%2),3 \n\t" - "vlef %%v16,28(%2),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v16,%%v16 \n\t" - "vfasb %%v0,%%v0,%%v16 \n\t" - "vleib %%v1,0,0 \n\t" - "vleib %%v1,1,1 \n\t" - "vleib %%v1,2,2 \n\t" - "vleib %%v1,3,3 \n\t" - "vleib %%v1,8,4 \n\t" - "vleib %%v1,9,5 \n\t" - "vleib %%v1,10,6 \n\t" - "vleib %%v1,11,7 \n\t" - "vleib %%v1,16,8 \n\t" - "vleib %%v1,17,9 \n\t" - "vleib %%v1,18,10 \n\t" - "vleib %%v1,19,11 \n\t" - "vleib %%v1,24,12 \n\t" - "vleib %%v1,25,13 \n\t" - "vleib %%v1,26,14 \n\t" - "vleib %%v1,27,15 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v2,16(%%r1,%2) \n\t" - "vpkg %%v17,%%v16,%%v2 \n\t" - "vperm %%v16,%%v16,%%v2,%%v1 \n\t" - - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v2,48(%%r1,%2) \n\t" - "vpkg %%v19,%%v18,%%v2 \n\t" - "vperm %%v18,%%v18,%%v2,%%v1 \n\t" - - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v2,80(%%r1,%2) \n\t" - "vpkg %%v21,%%v20,%%v2 \n\t" - "vperm %%v20,%%v20,%%v2,%%v1 \n\t" - - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v2,112(%%r1,%2) \n\t" - "vpkg %%v23,%%v22,%%v2 \n\t" - "vperm %%v22,%%v22,%%v2,%%v1 \n\t" - - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v2,144(%%r1,%2) \n\t" - "vpkg %%v25,%%v24,%%v2 \n\t" - "vperm %%v24,%%v24,%%v2,%%v1 \n\t" - - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v2,176(%%r1,%2) \n\t" - "vpkg %%v27,%%v26,%%v2 \n\t" - "vperm %%v26,%%v26,%%v2,%%v1 \n\t" - - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v2,208(%%r1,%2) \n\t" - "vpkg %%v29,%%v28,%%v2 \n\t" - "vperm %%v28,%%v28,%%v2,%%v1 \n\t" - - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v2,240(%%r1,%2) \n\t" - "vpkg %%v31,%%v30,%%v2 \n\t" - "vperm %%v30,%%v30,%%v2,%%v1 \n\t" - - "vflpsb %%v16,%%v16 \n\t" - "vflpsb %%v17,%%v17 \n\t" - "vflpsb %%v18,%%v18 \n\t" - "vflpsb %%v19,%%v19 \n\t" - "vflpsb %%v20,%%v20 \n\t" - "vflpsb %%v21,%%v21 \n\t" - "vflpsb %%v22,%%v22 \n\t" - "vflpsb %%v23,%%v23 \n\t" - "vflpsb %%v24,%%v24 \n\t" - "vflpsb %%v25,%%v25 \n\t" - "vflpsb %%v26,%%v26 \n\t" - "vflpsb %%v27,%%v27 \n\t" - "vflpsb %%v28,%%v28 \n\t" - "vflpsb %%v29,%%v29 \n\t" - "vflpsb %%v30,%%v30 \n\t" - "vflpsb %%v31,%%v31 \n\t" - - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v18,%%v18,%%v19 \n\t" - "vfasb %%v20,%%v20,%%v21 \n\t" - "vfasb %%v22,%%v22,%%v23 \n\t" - "vfasb 
%%v24,%%v24,%%v25 \n\t" - "vfasb %%v26,%%v26,%%v27 \n\t" - "vfasb %%v28,%%v28,%%v29 \n\t" - "vfasb %%v30,%%v30,%%v31 \n\t" - - "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" - "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" - "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" - "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" - - "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" - "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" - - "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" - - "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) + +static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb 
%%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - maxf = camax_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + maxf = camax_kernel_32(n1, x); + ix = n1 * 2; + i = n1; } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (maxf); + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index aec59058ed..842635afc4 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
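Both camax above and the camin kernel that follows lean on the same deinterleave step: vpkg collects the odd (imaginary) word of each doubleword pair while vperm, driven by the byte pattern preloaded into v1, collects the even (real) words; one vflpsb and one vfasb then produce CABS1 for four complex values at a time. A rough equivalent in GCC generic-vector C (an illustration under names of our own choosing, not the shipped kernel):

typedef float v4sf __attribute__ ((vector_size(16)));
typedef int v4si __attribute__ ((vector_size(16)));

/* ab = {r0,i0,r1,i1}, cd = {r2,i2,r3,i3}; returns |r|+|i| for all four */
static inline v4sf cabs1_x4(v4sf ab, v4sf cd) {
    const v4si even = {0, 2, 4, 6};  /* what the vperm pattern picks */
    const v4si odd  = {1, 3, 5, 7};  /* what vpkg picks */
    const v4si mask = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    v4sf re = __builtin_shuffle(ab, cd, even);
    v4sf im = __builtin_shuffle(ab, cd, odd);
    re = (v4sf)((v4si)re & mask);    /* vflpsb: clear the sign bits */
    im = (v4sf)((v4si)im & mask);
    return re + im;                  /* vfasb */
}

The reduction over these CABS1 vectors is then a plain tree of vfmaxsb (or vfminsb), as in the loop tails of the two kernels.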
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vlef %%v0,0(%2),0 \n\t" - "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),1 \n\t" - "vlef %%v16,12(%2),1 \n\t" - "vlef %%v0,16(%2),2 \n\t" - "vlef %%v16,20(%2),2 \n\t" - "vlef %%v0,24(%2),3 \n\t" - "vlef %%v16,28(%2),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v16,%%v16 \n\t" - "vfasb %%v0,%%v0,%%v16 \n\t" - "vleib %%v1,0,0 \n\t" - "vleib %%v1,1,1 \n\t" - "vleib %%v1,2,2 \n\t" - "vleib %%v1,3,3 \n\t" - "vleib %%v1,8,4 \n\t" - "vleib %%v1,9,5 \n\t" - "vleib %%v1,10,6 \n\t" - "vleib %%v1,11,7 \n\t" - "vleib %%v1,16,8 \n\t" - "vleib %%v1,17,9 \n\t" - "vleib %%v1,18,10 \n\t" - "vleib %%v1,19,11 \n\t" - "vleib %%v1,24,12 \n\t" - "vleib %%v1,25,13 \n\t" - "vleib %%v1,26,14 \n\t" - "vleib %%v1,27,15 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v2,16(%%r1,%2) \n\t" - "vpkg %%v17,%%v16,%%v2 \n\t" - "vperm %%v16,%%v16,%%v2,%%v1 \n\t" - - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v2,48(%%r1,%2) \n\t" - "vpkg %%v19,%%v18,%%v2 \n\t" - "vperm %%v18,%%v18,%%v2,%%v1 \n\t" - - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v2,80(%%r1,%2) \n\t" - "vpkg %%v21,%%v20,%%v2 \n\t" - "vperm %%v20,%%v20,%%v2,%%v1 \n\t" - - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v2,112(%%r1,%2) \n\t" - "vpkg %%v23,%%v22,%%v2 \n\t" - "vperm %%v22,%%v22,%%v2,%%v1 \n\t" - - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v2,144(%%r1,%2) \n\t" - "vpkg %%v25,%%v24,%%v2 \n\t" - "vperm %%v24,%%v24,%%v2,%%v1 \n\t" - - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v2,176(%%r1,%2) \n\t" - "vpkg %%v27,%%v26,%%v2 \n\t" - "vperm %%v26,%%v26,%%v2,%%v1 \n\t" - - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v2,208(%%r1,%2) \n\t" - "vpkg %%v29,%%v28,%%v2 \n\t" - "vperm %%v28,%%v28,%%v2,%%v1 \n\t" - - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v2,240(%%r1,%2) \n\t" - "vpkg %%v31,%%v30,%%v2 \n\t" - "vperm %%v30,%%v30,%%v2,%%v1 \n\t" - - "vflpsb %%v16,%%v16 \n\t" - "vflpsb %%v17,%%v17 \n\t" - "vflpsb %%v18,%%v18 \n\t" - "vflpsb %%v19,%%v19 \n\t" - "vflpsb %%v20,%%v20 \n\t" - "vflpsb %%v21,%%v21 \n\t" - "vflpsb %%v22,%%v22 \n\t" - "vflpsb %%v23,%%v23 \n\t" - "vflpsb %%v24,%%v24 \n\t" - "vflpsb %%v25,%%v25 \n\t" - "vflpsb %%v26,%%v26 \n\t" - "vflpsb %%v27,%%v27 \n\t" - "vflpsb %%v28,%%v28 \n\t" - "vflpsb %%v29,%%v29 \n\t" - "vflpsb %%v30,%%v30 \n\t" - "vflpsb %%v31,%%v31 \n\t" - - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v18,%%v18,%%v19 \n\t" - "vfasb %%v20,%%v20,%%v21 \n\t" - "vfasb %%v22,%%v22,%%v23 \n\t" - "vfasb %%v24,%%v24,%%v25 \n\t" - "vfasb %%v26,%%v26,%%v27 \n\t" - "vfasb %%v28,%%v28,%%v29 \n\t" - "vfasb %%v30,%%v30,%%v31 \n\t" - - "vfminsb %%v16,%%v16,%%v24,0 \n\t" - "vfminsb %%v18,%%v18,%%v26,0 \n\t" - "vfminsb %%v20,%%v20,%%v28,0 \n\t" - "vfminsb %%v22,%%v22,%%v30,0 \n\t" - - "vfminsb %%v16,%%v16,%%v20,0 \n\t" - "vfminsb %%v18,%%v18,%%v22,0 \n\t" - - "vfminsb %%v16,%%v16,%%v18,0 \n\t" - - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; +#define CABS1(x,i) (fabsf(x[i]) + 
fabsf(x[i + 1])) + +static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - 
if (n1 > 0) { - - minf = camin_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + minf = camin_kernel_32(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c index f4ebc21bd8..f59e5a20b3 100644 --- a/kernel/zarch/casum.c +++ b/kernel/zarch/casum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,140 +28,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
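The casum change that follows is not only reformatting: the accumulation now runs in eight independent vector registers (v24-v31) instead of four, so consecutive vfasb instructions do not serialize on a single register, and the fold happens once after the loop. The same latency-hiding pattern in scalar C, a sketch with four partial sums standing in for the eight vector accumulators:

#include <math.h>

static float asum_sketch(long n, const float *x) {
    float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
    long i;
    for (i = 0; i + 4 <= n; i += 4) {  /* independent adds can overlap */
        s0 += fabsf(x[i]);
        s1 += fabsf(x[i + 1]);
        s2 += fabsf(x[i + 2]);
        s3 += fabsf(x[i + 3]);
    }
    for (; i < n; i++)                 /* tail */
        s0 += fabsf(x[i]);
    return (s0 + s1) + (s2 + s3);      /* fold once, at the end */
}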
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vfasb %%v0,%%v0,%%v3 \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "ler %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 
224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ip=0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; - if ( inc_x == 1 ) - { + if (n <= 0 || inc_x <= 0) + return (sumf); - n1 = n & -32; - if ( n1 > 0 ) - { + if (inc_x == 1) { - sumf = casum_kernel_32(n1, x); - i=n1; - ip=2*n1; - } + n1 = n & -32; + if (n1 > 0) { - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - i++; - ip+=2; - } + sumf = casum_kernel_32(n1, x); + i = n1; + ip = 2 * n1; + } + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; } - else - { - inc_x2 = 2* inc_x; - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - ip+=inc_x2; - i++; - } + } else { + inc_x2 = 2 * inc_x; + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; } - return(sumf); -} - + } + return (sumf); +} diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index fe5568cc83..d86342bd0f 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,148 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
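For caxpy the arithmetic per complex element is y_r += da_r*x_r - da_i*x_i and y_i += da_r*x_i + da_i*x_r (signs adjusted under CONJ). The kernel realizes this with two fused multiply-adds per vector: one against a broadcast da_r, and one against a da_i vector whose lanes were sign-flipped by the vflcsb setup, applied to a copy of x rotated 32 bits within each 64-bit pair (verllg) so that real and imaginary lanes swap places. A scalar sketch of the non-conjugated path, matching the cleanup loop further down:

/* sketch of the per-element arithmetic in the non-CONJ path */
static void caxpy_sketch(long n, float da_r, float da_i,
                         const float *x, float *y) {
    for (long i = 0; i < 2 * n; i += 2) {
        y[i]     += da_r * x[i]     - da_i * x[i + 1]; /* real part */
        y[i + 1] += da_r * x[i + 1] + da_i * x[i];     /* imaginary part */
    }
}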
#include "common.h" -static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( #if !defined(CONJ) - "vlrepf %%v0,0(%3) \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v1,4(%3),2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,4(%3),1 \n\t" - "vlef %%v1,4(%3),3 \n\t" -#else - "vlef %%v0,0(%3),1 \n\t" - "vlef %%v0,0(%3),3 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v0,0(%3),2 \n\t" - "vlrepf %%v1,4(%3) \n\t" + "vlrepf %%v0,0(%[alpha])\n\t" + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" +#else + "vlef %%v0,0(%[alpha]),1\n\t" + "vlef %%v0,0(%[alpha]),3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,0(%[alpha]),0\n\t" + "vlef %%v0,0(%[alpha]),2\n\t" + "vlrepf %%v1,4(%[alpha])\n\t" #endif - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - - "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,80(%%r1,%1) \n\t" - "vl %%v18,96(%%r1,%1) \n\t" - "vl %%v19,112(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - - "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,64(%%r1,%2) \n\t" - "vst %%v29,80(%%r1,%2) \n\t" - "vst %%v30,96(%%r1,%2) \n\t" - "vst %%v31,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl 
%%v23,112(%%r1,%[y])\n\t"
+          "verllg %%v24,%%v8,32\n\t"
+          "verllg %%v25,%%v9,32\n\t"
+          "verllg %%v26,%%v10,32\n\t"
+          "verllg %%v27,%%v11,32\n\t"
+          "verllg %%v28,%%v16,32\n\t"
+          "verllg %%v29,%%v17,32\n\t"
+          "verllg %%v30,%%v18,32\n\t"
+          "verllg %%v31,%%v19,32\n\t"
+          "vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
+          "vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
+          "vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
+          "vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
+          "vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
+          "vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
+          "vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
+          "vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
+          "vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
+          "vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
+          "vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
+          "vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
+          "vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
+          "vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
+          "vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
+          "vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
+          "vst %%v8,0(%%r1,%[y])\n\t"
+          "vst %%v9,16(%%r1,%[y])\n\t"
+          "vst %%v10,32(%%r1,%[y])\n\t"
+          "vst %%v11,48(%%r1,%[y])\n\t"
+          "vst %%v16,64(%%r1,%[y])\n\t"
+          "vst %%v17,80(%%r1,%[y])\n\t"
+          "vst %%v18,96(%%r1,%[y])\n\t"
+          "vst %%v19,112(%%r1,%[y])\n\t"
+          "agfi %%r1,128\n\t"
+          "brctg %[n],0b"
+          : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x),
+          "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
+          "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
 }
 
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
-    BLASLONG i = 0;
-    BLASLONG ix = 0, iy = 0;
-    FLOAT da[2] __attribute__ ((aligned(16)));
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
+          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+          BLASLONG dummy2) {
+  BLASLONG i = 0;
+  BLASLONG ix = 0, iy = 0;
+  FLOAT da[2] __attribute__ ((aligned(16)));
 
-    if (n <= 0) return (0);
+  if (n <= 0)
+    return (0);
 
-    if ((inc_x == 1) && (inc_y == 1)) {
+  if ((inc_x == 1) && (inc_y == 1)) {
 
-        BLASLONG n1 = n & -16;
+    BLASLONG n1 = n & -16;
 
-        if (n1) {
-            da[0] = da_r;
-            da[1] = da_i;
-            caxpy_kernel_16(n1, x, y, da);
-            ix = 2 * n1;
-        }
-        i = n1;
-        while (i < n) {
+    if (n1) {
+      da[0] = da_r;
+      da[1] = da_i;
+      caxpy_kernel_16(n1, x, y, da);
+      ix = 2 * n1;
+    }
+    i = n1;
+    while (i < n) {
 #if !defined(CONJ)
-        y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
-        y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+      y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
+      y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
 #else
-        y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
-        y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+      y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
+      y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
 #endif
-        i++;
-        ix += 2;
-
-        }
-        return (0);
-
+      i++;
+      ix += 2;
     }
+    return (0);
 
-    inc_x *= 2;
-    inc_y *= 2;
+  }
 
-    while (i < n) {
+  inc_x *= 2;
+  inc_y *= 2;
+
+  while (i < n) {
 #if !defined(CONJ)
-        y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
-        y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+    y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
+    y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
 #else
-        y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
-        y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
-#endif
-        ix += inc_x;
-        iy += inc_y;
-        i++;
+    y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
+    y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+#endif
+    ix += inc_x;
+    iy += inc_y;
+    i++;
 
-    }
-    return (0);
+  }
+  return (0);
 }
-
-
diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c
index fc0b8d6485..1b93a812eb 100644
--- a/kernel/zarch/ccopy.c
+++ b/kernel/zarch/ccopy.c
@@ -1,5 +1,5 @@ 
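ccopy, next, is pure data movement, so the kernel needs no vector registers at all: mvc moves 256 bytes per iteration and la advances both pointers, which is why the rewritten constraint list shrinks to the two "+&a" pointer operands plus "cc". In plain C the bulk copy amounts to the following (a sketch; the real loop also issues pfd prefetches ahead of both streams):

#include <string.h>

/* n is pre-rounded to a multiple of 32 complex elements,
   so each iteration moves 64 floats = 256 bytes */
static void ccopy_sketch(long n, const float *x, float *y) {
    for (long i = 0; i < 2 * n; i += 64)
        memcpy(&y[i], &x[i], 256);
}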
/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,5 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","r2" - ); +static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n * 2]) x) + : "cc"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + if (n <= 0) + return (0); - if ( (inc_x == 1) && (inc_y == 1 )) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - ccopy_kernel_32(n1, x, y); - i=n1; - ix=n1*2; - iy=n1*2; - } - - while(i < n) - { - y[iy] = x[iy] ; - y[iy+1] = x[ix+1] ; - ix+=2; - iy+=2; - i++ ; - - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + ccopy_kernel_32(n1, x, y); + i = n1; + ix = n1 * 2; + iy = n1 * 2; + } + while (i < n) { + y[iy] = x[iy]; + y[iy + 1] = x[ix + 1]; + ix += 2; + iy += 2; + i++; } - else - { - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + } else { - while(i < n) - { - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; - } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += inc_x2; + iy += inc_y2; + i++; } - - return(0); + + } + + return (0); } diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c index 3eda2979b9..64d81ae5c9 100644 --- a/kernel/zarch/cdot.c +++ b/kernel/zarch/cdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,156 +27,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
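cdot avoids forming each complex product separately: it accumulates four running sums, sum(x_r*y_r), sum(x_i*y_i), sum(x_r*y_i) and sum(x_i*y_r), and only combines them at the end, where the CONJ and non-CONJ variants differ by nothing but two signs. The scalar shape of the accumulation (the vector code keeps eight such partials and uses verllg to pair each x element against both halves of y):

static void cdot_sketch(long n, const float *x, const float *y,
                        float dot[4]) {
    for (long i = 0; i < 2 * n; i += 2) {
        dot[0] += x[i] * y[i];         /* x_r * y_r */
        dot[1] += x[i + 1] * y[i + 1]; /* x_i * y_i */
        dot[2] += x[i] * y[i + 1];     /* x_r * y_i */
        dot[3] += x[i + 1] * y[i];     /* x_i * y_r */
    }
}

With these, the real part is dot[0] - dot[1] and the imaginary part dot[2] + dot[3] in the plain case, or dot[0] + dot[1] and dot[2] - dot[3] when x is conjugated, exactly as the tail of the file computes.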
#include "common.h" -static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "vzero %%v28 \n\t" - "vzero %%v29 \n\t" - "vzero %%v30 \n\t" - "vzero %%v31 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - "verllg %%v22,%%v18,32 \n\t" - "verllg %%v23,%%v19,32 \n\t" - - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" - - "vl %%v16, 64(%%r1,%1) \n\t" - "vl %%v17, 80(%%r1,%1) \n\t" - "vl %%v18, 96(%%r1,%1) \n\t" - "vl %%v19, 112(%%r1,%1) \n\t" - "vl %%v0, 64(%%r1,%2) \n\t" - "vl %%v1, 80(%%r1,%2) \n\t" - "vl %%v2, 96(%%r1,%2) \n\t" - "vl %%v3, 112(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - "verllg %%v22,%%v18,32 \n\t" - "verllg %%v23,%%v19,32 \n\t" - - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v24,%%v24,%%v26 \n\t" - "vfasb %%v24,%%v24,%%v28 \n\t" - "vfasb %%v24,%%v24,%%v30 \n\t" - "vrepg %%v26,%%v24,1 \n\t" - "vfasb %%v24,%%v24,%%v26 \n\t" - "vfasb %%v25,%%v25,%%v27 \n\t" - "vfasb %%v25,%%v25,%%v29 \n\t" - "vfasb %%v25,%%v25,%%v31 \n\t" - "vrepg %%v27,%%v25,1 \n\t" - "vfasb %%v25,%%v25,%%v27 \n\t" - "vstef %%v24,0(%3),0 \n\t" - "vstef %%v24,4(%3),1 \n\t" - "vstef %%v25,8(%3),1 \n\t" - "vstef %%v25,12(%3),0 " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 
64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vrepg %%v26,%%v24,1\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vfasb %%v25,%%v25,%%v29\n\t" + "vfasb %%v25,%%v25,%%v31\n\t" + "vrepg %%v27,%%v25,1\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vstef %%v24,0(%[d]),0\n\t" + "vstef %%v24,4(%[d]),1\n\t" + "vstef %%v25,8(%[d]),1\n\t" + "vstef %%v25,12(%[d]),0" + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); - - } +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); - if ((inc_x == 1) && (inc_y == 1)) { + } - BLASLONG n1 = n & -16; + if ((inc_x == 1) && (inc_y == 1)) { - if (n1) - cdot_kernel_16(n1, x, y, dot); + BLASLONG n1 = n & -16; - i = n1; - BLASLONG j = i * 2; + if (n1) + cdot_kernel_16(n1, x, y, dot); - while (i < n) { + i = n1; + BLASLONG j = i * 2; - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; + while (i < n) { - j += 2; - i++; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - } + j += 2; + i++; + } - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } } + } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } - - diff --git 
a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index ed81325e1a..db91d90634 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,719 +25,720 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include <stdlib.h> -#include <stdio.h> #include "common.h" #define NBMAX 2048 -static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%5) \n\t" - "vlrepg %%v17,8(%5) \n\t" - "vlrepg %%v18,16(%5) \n\t" - "vlrepg %%v19,24(%5) \n\t" +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v18,16(%[x])\n\t" + "vlrepg %%v19,24(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" #else - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" + "vflcsb %%v23,%%v23\n\t" +
"vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlef %%v24,0(%%r1,%1),0 \n\t" - "vlef %%v24,0(%%r1,%1),1 \n\t" - "vlef %%v24,8(%%r1,%1),2 \n\t" - "vlef %%v24,8(%%r1,%1),3 \n\t" - "vlef %%v25,4(%%r1,%1),0 \n\t" - "vlef %%v25,4(%%r1,%1),1 \n\t" - "vlef %%v25,12(%%r1,%1),2 \n\t" - "vlef %%v25,12(%%r1,%1),3 \n\t" - "vlef %%v26,0(%%r1,%2),0 \n\t" - "vlef %%v26,0(%%r1,%2),1 \n\t" - "vlef %%v26,8(%%r1,%2),2 \n\t" - "vlef %%v26,8(%%r1,%2),3 \n\t" - "vlef %%v27,4(%%r1,%2),0 \n\t" - "vlef %%v27,4(%%r1,%2),1 \n\t" - "vlef %%v27,12(%%r1,%2),2 \n\t" - "vlef %%v27,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlef %%v28,0(%%r1,%3),0 \n\t" - "vlef %%v28,0(%%r1,%3),1 \n\t" - "vlef %%v28,8(%%r1,%3),2 \n\t" - "vlef %%v28,8(%%r1,%3),3 \n\t" - "vlef %%v29,4(%%r1,%3),0 \n\t" - "vlef %%v29,4(%%r1,%3),1 \n\t" - "vlef %%v29,12(%%r1,%3),2 \n\t" - "vlef %%v29,12(%%r1,%3),3 \n\t" - "vlef %%v30,0(%%r1,%4),0 \n\t" - "vlef %%v30,0(%%r1,%4),1 \n\t" - "vlef %%v30,8(%%r1,%4),2 \n\t" - "vlef %%v30,8(%%r1,%4),3 \n\t" - "vlef %%v31,4(%%r1,%4),0 \n\t" - "vlef %%v31,4(%%r1,%4),1 \n\t" - "vlef %%v31,12(%%r1,%4),2 \n\t" - "vlef %%v31,12(%%r1,%4),3 \n\t" - - "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,0(%%r1,%6) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vperm %%v25,%%v24,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v24,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap1])\n\t" + "vperm %%v27,%%v26,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v26,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" + "vl %%v28,0(%%r1,%[ap2])\n\t" + "vperm %%v29,%%v28,%%v28,%%v2\n\t" + "vperm 
%%v28,%%v28,%%v28,%%v1\n\t" + "vl %%v30,0(%%r1,%[ap3])\n\t" + "vperm %%v31,%%v30,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v30,%%v1\n\t" + "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%3) \n\t" - "vlrepg %%v17,8(%3) \n\t" +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" #else - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlef %%v20,0(%%r1,%1),0 \n\t" - "vlef %%v20,0(%%r1,%1),1 \n\t" - "vlef %%v20,8(%%r1,%1),2 \n\t" - "vlef %%v20,8(%%r1,%1),3 \n\t" - "vlef %%v21,4(%%r1,%1),0 \n\t" - "vlef %%v21,4(%%r1,%1),1 \n\t" - "vlef %%v21,12(%%r1,%1),2 \n\t" - "vlef %%v21,12(%%r1,%1),3 \n\t" - "vlef %%v22,0(%%r1,%2),0 \n\t" - "vlef %%v22,0(%%r1,%2),1 \n\t" - "vlef %%v22,8(%%r1,%2),2 \n\t" - "vlef %%v22,8(%%r1,%2),3 \n\t" - "vlef %%v23,4(%%r1,%2),0 \n\t" - "vlef %%v23,4(%%r1,%2),1 \n\t" - "vlef %%v23,12(%%r1,%2),2 \n\t" - "vlef %%v23,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%4) \n\t" - "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,0(%%r1,%4) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "vleib 
%%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v20,0(%%r1,%[ap0])\n\t" + "vperm %%v21,%%v20,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v20,%%v1\n\t" + "vl %%v22,0(%%r1,%[ap1])\n\t" + "vperm %%v23,%%v22,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v22,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23"); } -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%2) \n\t" +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" - "vflcsb %%v17,%%v17 \n\t" - "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" #else - "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" - "vflcsb %%v17,%%v17 \n\t" - "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vlef %%v18,0(%%r1,%1),0 \n\t" - "vlef %%v18,0(%%r1,%1),1 \n\t" - "vlef %%v18,8(%%r1,%1),2 \n\t" - "vlef %%v18,8(%%r1,%1),3 \n\t" - "vlef %%v19,4(%%r1,%1),0 \n\t" - "vlef %%v19,4(%%r1,%1),1 \n\t" - "vlef %%v19,12(%%r1,%1),2 \n\t" - "vlef %%v19,12(%%r1,%1),3 \n\t" - - "vl %%v0,0(%%r1,%3) \n\t" - "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19" - ); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib 
%%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v18,0(%%r1,%[ap])\n\t" + "vperm %%v19,%%v18,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v18,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); } -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) -{ - __asm__ volatile ( -#if !defined(XCONJ) - "vlrepf %%v0,%3 \n\t" - "vlef %%v1,%4,0 \n\t" - "vlef %%v1,%4,2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,%4,1 \n\t" - "vlef %%v1,%4,3 \n\t" +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, + FLOAT alpha_i) { + __asm__( +#if !defined(XCONJ) + "vlrepf %%v0,%[alpha_r]\n\t" + "vlef %%v1,%[alpha_i],0\n\t" + "vlef %%v1,%[alpha_i],2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,%[alpha_i],1\n\t" + "vlef %%v1,%[alpha_i],3\n\t" #else - "vlef %%v0,%3,1 \n\t" - "vlef %%v0,%3,3 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,%3,0 \n\t" - "vlef %%v0,%3,2 \n\t" - "vlrepf %%v1,%4 \n\t" + "vlef %%v0,%[alpha_r],1\n\t" + "vlef %%v0,%[alpha_r],3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,%[alpha_r],0\n\t" + "vlef %%v0,%[alpha_r],2\n\t" + "vlrepf %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,2 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,0(%%r1,%2) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - - "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" - "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t" - - "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t" - "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t" - - "vst %%v22,0(%%r1,%2) \n\t" - "vst %%v23,16(%%r1,%2) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,0(%%r1,%[dest])\n\t" + "vl %%v19,16(%%r1,%[dest])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" + "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" + "vst %%v22,0(%%r1,%[dest])\n\t" + "vst %%v23,16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), + [alpha_r] "m"(alpha_r),[alpha_i] 
"m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, + FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if ( inc_dest != 2 ) - { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if ( inc_x != 2 ) - copy_x(NB,x_ptr,xbuffer,inc_x); - else - xbuffer = x_ptr; - - if ( inc_y == 2 ) - { - - for( i = 0; i < n1 ; i++) - { - cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if ( n2 & 2 ) - { - cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if ( n2 & 1 ) - { - cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } - else - { - - for( i = 0; i < n1 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for( i = 0; i < n2 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - - - if ( m3 == 0 ) return(0); - - x_ptr = x; - j=0; - a_ptr = a; - y_ptr = y; - - if ( m3 == 3 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while ( j < n) - { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8], *xbuffer; + FLOAT alpha[2]; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, ap, xbuffer, 
y_ptr, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } - - - if ( m3 == 2 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = 
x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } - + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; 
+ y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } - return(0); - } + return (0); + } + if (m3 == 1) { - if ( m3 == 1 ) - { + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - return(0); + return (0); } diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c 
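The cgemv_n_4.c rewrite above does two things besides converting the kernels to symbolic operand names: the constraint lists now declare the touched arrays as explicit "m"/"+m" operands instead of relying on a blanket "memory" clobber, and the duplicated real/imaginary lanes that the old code assembled with eight vlef element loads per vector are now produced by a single vl followed by two vperm shuffles through byte masks built once in %%v1 and %%v2. A minimal sketch of that shuffle, assuming GCC's -mzvector intrinsics (the helper name and the vec_xl/vec_perm spelling are illustrative, not part of the patch):

    #include <vecintrin.h>

    /* Build {r0,r0,r1,r1} and {i0,i0,i1,i1} from one 16-byte load holding
       two complex floats {r0,i0,r1,i1}; the byte masks mirror the vleib
       sequences in the kernels above. */
    static inline void splat_real_imag(const float *src,
                                       vector float *re, vector float *im) {
      const vector unsigned char perm_re =
          { 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 };
      const vector unsigned char perm_im =
          { 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15 };
      vector float v = vec_xl(0, (float *) src);
      *re = vec_perm(v, v, perm_re);
      *im = vec_perm(v, v, perm_im);
    }

The masks are loop invariant, so in the steady state each 16-byte column load costs one vl and two vperm instead of eight element inserts.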
index f04a624ac7..669d78a9d5 100644 --- a/kernel/zarch/crot.c +++ b/kernel/zarch/crot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepf %%v0,%3 \n\t" - "vlrepf %%v1,%4 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" 
- "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepf %%v0,%[c]\n\t" + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb 
%%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb 
%%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - crot_kernel_32(n1, x, y, &cosa, &sina); - i=n1; - ix=2*n1; - } - - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + crot_kernel_32(n1, x, y, &cosa, &sina); + i = n1; + ix = 2 * n1; + } - } + while (i < n) { + temp[0] = c * x[ix] + s * y[ix]; + temp[1] = c * x[ix + 1] + s * y[ix + 1]; + y[ix] = c * y[ix] - s * x[ix]; + y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + ix += 2; + i++; } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - } + } else { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + while (i < n) { + temp[0] = c * x[ix] + s * y[iy]; + temp[1] = c * x[ix + 1] + s * y[iy + 1]; + y[iy] = c * y[iy] - s * x[ix]; + y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index 0c15c5addb..a2d5bf2239 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013 - 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,430 +27,400 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
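In the crot.c hunks above, each complex element goes through the same plane rotation as the scalar tail loop; a standalone sketch of one step (the helper name is illustrative, not part of the patch):

    /* One complex element of the rotation crot_kernel_32 vectorizes:
       x' = c*x + s*y and y' = c*y - s*x, applied to the real and the
       imaginary part independently, so a temporary holds the new x. */
    static inline void crot_step(float *x, float *y, float c, float s) {
      float tr = c * x[0] + s * y[0];
      float ti = c * x[1] + s * y[1];
      y[0] = c * y[0] - s * x[0];
      y[1] = c * y[1] - s * x[1];
      x[0] = tr;
      x[1] = ti;
    }

In the vector code c and s are broadcast once with vlrepf, vfmsb forms the first product, and vfmasb/vfmssb fold the second product in as a fused multiply-add or multiply-subtract; that is what the yn=x*s and yn=y*c-yn comments track.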
#include "common.h" -static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepf %%v0,0(%1) \n\t" - "vlef %%v1,4(%1),0 \n\t" - "vlef %%v1,4(%1),2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,4(%1),1 \n\t" - "vlef %%v1,4(%1),3 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - "verllg %%v28,%%v20,32 \n\t" - "verllg %%v29,%%v21,32 \n\t" - "verllg %%v30,%%v22,32 \n\t" - "verllg %%v31,%%v23,32 \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlef %%v0,4(%1),0 \n\t" - "vlef %%v0,4(%1),2 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,4(%1),1 \n\t" - "vlef %%v0,4(%1),3 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v16,%%v16,32 \n\t" - "verllg %%v17,%%v17,32 \n\t" - "verllg %%v18,%%v18,32 \n\t" - "verllg %%v19,%%v19,32 \n\t" - "verllg %%v20,%%v20,32 \n\t" - "verllg %%v21,%%v21,32 \n\t" - "verllg %%v22,%%v22,32 \n\t" - "verllg %%v23,%%v23,32 \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepf %%v0,0(%[alpha])\n\t" + 
"vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v24,%%v16,32\n\t" + "verllg %%v25,%%v17,32\n\t" + "verllg %%v26,%%v18,32\n\t" + "verllg %%v27,%%v19,32\n\t" + "verllg %%v28,%%v20,32\n\t" + "verllg %%v29,%%v21,32\n\t" + "verllg %%v30,%%v22,32\n\t" + "verllg %%v31,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepf %%v0,0(%1) \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlef %%v0,4(%[alpha]),0\n\t" + "vlef %%v0,4(%[alpha]),2\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,4(%[alpha]),1\n\t" + "vlef %%v0,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl 
%%v23,112(%%r1,%[x])\n\t" + "verllg %%v16,%%v16,32\n\t" + "verllg %%v17,%%v17,32\n\t" + "verllg %%v18,%%v18,32\n\t" + "verllg %%v19,%%v19,32\n\t" + "verllg %%v20,%%v20,32\n\t" + "verllg %%v21,%%v21,32\n\t" + "verllg %%v22,%%v22,32\n\t" + "verllg %%v23,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepf %%v0,0(%[alpha])\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; - - for (i = 0; i < n; i += 4) - { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = t2; 
- x[inc_x3] = t3; - - x += 4 * inc_x; - } +static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - FLOAT alpha[2] __attribute__ ((aligned(16))); - - if (inc_x != 1) { - inc_x <<= 1; - - if (da_r == 0.0) { - - BLASLONG n1 = n & -2; - - if (da_i == 0.0) { +static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, + BLASLONG inc_x) { + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} - while (j < n1) { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; + if (inc_x != 1) { + inc_x <<= 1; - } + if (da_r == 0.0) { - while (j < n) { + BLASLONG n1 = n & -2; - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; + if (da_i == 0.0) { - } + while (j < n1) { - } else { + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; - while (j < n1) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; - while (j < n) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + } else { - } + while (j < n1) { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + } - } + while (j < n) { - } else { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + } - if (da_i == 0.0) { - BLASLONG n1 = n & -2; + } - while (j < n1) { + } else { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + if (da_i == 0.0) { + BLASLONG n1 = n & -2; - } + while 
(j < n1) { - while (j < n) { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; + } - } + while (j < n) { - } else { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; - BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - cscal_kernel_inc_8(n1, alpha, x, inc_x); - j = n1; - i = n1 * inc_x; - } + } - while (j < n) { + } else { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } - } + while (j < n) { - } + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; } - return (0); - } + } + } - BLASLONG n1 = n & -16; - if (n1 > 0) { + return (0); + } - alpha[0] = da_r; - alpha[1] = da_i; + BLASLONG n1 = n & -16; + if (n1 > 0) { - if (da_r == 0.0) - if (da_i == 0) - cscal_kernel_16_zero(n1, x); - else - cscal_kernel_16_zero_r(n1, alpha, x); - else - if (da_i == 0) - cscal_kernel_16_zero_i(n1, alpha, x); - else - cscal_kernel_16(n1, alpha, x); + alpha[0] = da_r; + alpha[1] = da_i; - i = n1 << 1; - j = n1; - } + if (da_r == 0.0) + if (da_i == 0) + cscal_kernel_16_zero(n1, x); + else + cscal_kernel_16_zero_r(n1, alpha, x); + else if (da_i == 0) + cscal_kernel_16_zero_i(n1, alpha, x); + else + cscal_kernel_16(n1, alpha, x); + i = n1 << 1; + j = n1; + } - if (da_r == 0.0) { + if (da_r == 0.0) { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } + } - } + } - } else { + } else { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } - - } + } } - return (0); + } + + return (0); } diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c index 256995d500..92a81591fb 100644 --- a/kernel/zarch/cswap.c +++ b/kernel/zarch/cswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
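The rewritten cswap kernel below exchanges 32 single-precision complex elements (256 bytes) of x and y per iteration, staging x in v16-v31 while y moves through v0-v7 in two 128-byte halves. For reference, a minimal scalar sketch of the semantics being vectorized; the helper name and the inc_x == inc_y == 1 assumption are illustrative only, not part of the patch:

/* Reference semantics: exchange n complex (2-float) elements of x and y. */
static void cswap_ref(long n, float *x, float *y) {
  long i;
  for (i = 0; i < 2 * n; i++) { /* two floats per complex element */
    float t = x[i];
    x[i] = y[i];
    y[i] = t;
  }
}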
#include "common.h" -static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst 
%%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - cswap_kernel_32(n1, x, y); - i=n1; - ix = 2* n1; - iy = 2* n1; - } - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += 2 ; - iy += 2 ; - i++ ; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + cswap_kernel_32(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + while (i < n) { - } + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + ix += 2; + iy += 2; + i++; } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + } else { - while(i < n) - { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; + while (i < n) { - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; - } + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 827467189e..37008f702d 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -1,5 +1,5 @@ 
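damax now leans on the z14 vfmaxdb instruction; the mask value 8 selects the magnitude-comparing variant, which is why this loop needs no vflpdb absolute-value steps (contrast the z13 version further down) and only a final lpdr to clear the sign. A scalar sketch of the semantics, assuming inc_x == 1 and n > 0:

#include <math.h>

/* Reference: largest absolute value over n doubles. */
static double damax_ref(long n, const double *x) {
  double amax = fabs(x[0]);
  long i;
  for (i = 1; i < n; i++)
    if (fabs(x[i]) > amax)
      amax = fabs(x[i]);
  return amax;
}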
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmaxdb %%v16,%%v16,%%v24,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v25,8 \n\t" - "vfmaxdb %%v18,%%v18,%%v26,8 \n\t" - "vfmaxdb %%v19,%%v19,%%v27,8 \n\t" - "vfmaxdb %%v20,%%v20,%%v28,8 \n\t" - "vfmaxdb %%v21,%%v21,%%v29,8 \n\t" - "vfmaxdb %%v22,%%v22,%%v30,8 \n\t" - "vfmaxdb %%v23,%%v23,%%v31,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v20,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v21,8 \n\t" - "vfmaxdb %%v18,%%v18,%%v22,8 \n\t" - "vfmaxdb %%v19,%%v19,%%v23,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v18,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v19,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" - - "vfmaxdb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmaxdb %%v0,%%v0,%%v16,8 \n\t" - "lpdr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxdb %%v16,%%v16,%%v24,8\n\t" + "vfmaxdb %%v17,%%v17,%%v25,8\n\t" + "vfmaxdb %%v18,%%v18,%%v26,8\n\t" + "vfmaxdb %%v19,%%v19,%%v27,8\n\t" + "vfmaxdb %%v20,%%v20,%%v28,8\n\t" + "vfmaxdb %%v21,%%v21,%%v29,8\n\t" + "vfmaxdb %%v22,%%v22,%%v30,8\n\t" + "vfmaxdb %%v23,%%v23,%%v31,8\n\t" + "vfmaxdb %%v16,%%v16,%%v20,8\n\t" + "vfmaxdb %%v17,%%v17,%%v21,8\n\t" + "vfmaxdb %%v18,%%v18,%%v22,8\n\t" + "vfmaxdb %%v19,%%v19,%%v23,8\n\t" + "vfmaxdb %%v16,%%v16,%%v18,8\n\t" + "vfmaxdb %%v17,%%v17,%%v19,8\n\t" + "vfmaxdb %%v16,%%v16,%%v17,8\n\t" + 
"vfmaxdb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = damax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = damax_kernel_32(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 95b94ee4ae..530d6e5bb6 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + 
"vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = damax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = damax_kernel_32(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - 
j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 821f9eccc8..a01791741d 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmindb %%v16,%%v16,%%v24,8 \n\t" - "vfmindb %%v17,%%v17,%%v25,8 \n\t" - "vfmindb %%v18,%%v18,%%v26,8 \n\t" - "vfmindb %%v19,%%v19,%%v27,8 \n\t" - "vfmindb %%v20,%%v20,%%v28,8 \n\t" - "vfmindb %%v21,%%v21,%%v29,8 \n\t" - "vfmindb %%v22,%%v22,%%v30,8 \n\t" - "vfmindb %%v23,%%v23,%%v31,8 \n\t" - - "vfmindb %%v16,%%v16,%%v20,8 \n\t" - "vfmindb %%v17,%%v17,%%v21,8 \n\t" - "vfmindb %%v18,%%v18,%%v22,8 \n\t" - "vfmindb %%v19,%%v19,%%v23,8 \n\t" - - "vfmindb %%v16,%%v16,%%v18,8 \n\t" - "vfmindb %%v17,%%v17,%%v19,8 \n\t" - - "vfmindb %%v16,%%v16,%%v17,8 \n\t" - - "vfmindb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,8 \n\t" - "lpdr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,8\n\t" + "vfmindb %%v17,%%v17,%%v25,8\n\t" + "vfmindb %%v18,%%v18,%%v26,8\n\t" + "vfmindb %%v19,%%v19,%%v27,8\n\t" + "vfmindb %%v20,%%v20,%%v28,8\n\t" + "vfmindb %%v21,%%v21,%%v29,8\n\t" + "vfmindb 
%%v22,%%v22,%%v30,8\n\t" + "vfmindb %%v23,%%v23,%%v31,8\n\t" + "vfmindb %%v16,%%v16,%%v20,8\n\t" + "vfmindb %%v17,%%v17,%%v21,8\n\t" + "vfmindb %%v18,%%v18,%%v22,8\n\t" + "vfmindb %%v19,%%v19,%%v23,8\n\t" + "vfmindb %%v16,%%v16,%%v18,8\n\t" + "vfmindb %%v17,%%v17,%%v19,8\n\t" + "vfmindb %%v16,%%v16,%%v17,8\n\t" + "vfmindb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = damin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = damin_kernel_32(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 538690ee55..2172b6d6f5 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
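The z13 minimum mirrors the z13 maximum above, with the vfchdb operand order swapped (the second vector compared against the first) so that vsel keeps the smaller element. In scalar terms each step is, roughly:

/* Compare-and-select for the minimum: operands swapped versus the max case. */
static double min_sel(double a, double b) {
  return (b > a) ? a : b;
}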
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + 
"vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = damin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = damin_kernel_32(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - 
j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index fea431c34f..9f69a99314 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,145 +28,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v2 \n\t" - "vfadb %%v0,%%v0,%%v3 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +#define ABS fabs + +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, 
%%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - if (n <= 0 || inc_x <= 0) return sumf; - - if (inc_x == 1) { - - n1 = n & -32; - - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return sumf; - sumf = dasum_kernel_32(n1, x); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - sumf += ABS(x[i]); - i++; - } + n1 = n & -32; - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + if (n1 > 0) { - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + sumf = dasum_kernel_32(n1, x); + i = n1; + } - i += inc_x * 4; - j += 4; + while (i < n) { + sumf += ABS(x[i]); + i++; + } - } - sumf = sum1 + sum2; - while (j < n) { + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - sumf += ABS(x[i]); - i += inc_x; - j++; - } + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; } - return sumf; -} + sumf = sum1 + sum2; + while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } + } + return sumf; +} diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index e8823745e4..179ef8834c 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( - "vlrepg %%v0,%3 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - - "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,80(%%r1,%1) \n\t" - "vl %%v26,96(%%r1,%1) \n\t" - "vl %%v27,112(%%r1,%1) \n\t" - "vl %%v28,64(%%r1,%2) \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vl %%v30,96(%%r1,%2) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "vl %%v16,128(%%r1,%1) \n\t" - "vl %%v17,144(%%r1,%1) \n\t" - "vl %%v18,160(%%r1,%1) \n\t" - "vl %%v19,176(%%r1,%1) \n\t" - "vl %%v20,128(%%r1,%2) \n\t" - "vl %%v21,144(%%r1,%2) \n\t" - "vl %%v22,160(%%r1,%2) \n\t" - "vl %%v23,176(%%r1,%2) \n\t" - - "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,192(%%r1,%1) \n\t" - "vl %%v25,208(%%r1,%1) \n\t" - "vl %%v26,224(%%r1,%1) \n\t" - "vl %%v27,240(%%r1,%1) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,128(%%r1,%2) \n\t" - "vst %%v17,144(%%r1,%2) \n\t" - "vst %%v18,160(%%r1,%2) \n\t" - "vst %%v19,176(%%r1,%2) \n\t" - "vst %%v20,192(%%r1,%2) \n\t" - "vst %%v21,208(%%r1,%2) \n\t" - "vst %%v22,224(%%r1,%2) \n\t" - "vst %%v23,240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__("vlrepg %%v0,%[alpha]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl 
%%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + [alpha] "m"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return 0 ; + if (n <= 0) + return 0; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -32; - if ( n1 ) - daxpy_kernel_32(n1, x, y , &da); + if (n1) + daxpy_kernel_32(n1, x, y, &da); - i = n1; - while(i < n) - { - - y[i] += da * x[i] ; - i++ ; - - } - return 0 ; + i = n1; + while (i < n) { + y[i] += da * x[i]; + i++; } + return 0; - BLASLONG n1 = n & -4; + } - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = da * x[ix] ; - FLOAT m2 = da * x[ix+inc_x] ; - FLOAT m3 = da * x[ix+2*inc_x] ; - FLOAT m4 = da * x[ix+3*inc_x] ; + while (i < n1) { - y[iy] += m1 ; - y[iy+inc_y] += m2 ; - y[iy+2*inc_y] += m3 ; - y[iy+3*inc_y] += m4 ; + FLOAT m1 = da * x[ix]; + FLOAT m2 = da * x[ix + inc_x]; + FLOAT m3 = da * x[ix + 2 * inc_x]; + FLOAT m4 = da * x[ix + 3 * inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; - i+=4 ; + y[iy] += m1; + y[iy + inc_y] += m2; + y[iy + 2 * inc_y] += m3; + y[iy + 3 * inc_y] += m4; - } + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; - while(i < n) - { + } - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - return 
0 ; - -} + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + return 0; +} diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index bb53256931..f7cbf54b2e 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,5 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","r2" - ); +static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n]) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if (n <= 0) return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - dcopy_kernel_32(n1, x, y); - i = n1; - } + if (n <= 0) + return 0; - while (i < n) { - y[i] = x[i]; - i++; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dcopy_kernel_32(n1, x, y); + i = n1; + } + while (i < n) { + y[i] = x[i]; + i++; - } else { + } - while (i < n) { + } else { - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + while (i < n) { - } + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; } - return 0; + } + return 0; } diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index ff4c347a6c..f5f601717c 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,123 +27,127 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
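ddot is restructured around eight independent accumulators (v0-v7) so that consecutive vfmadb operations do not serialize on a single register; the partial sums are combined only after the loop, followed by a vrepg/adbr fold down to a scalar. A sketch of the idea in plain C, assuming inc_x == inc_y == 1 (the asm actually consumes 16 doubles per iteration, two per vector accumulator):

/* Dot product with 8 partial sums to break the FMA dependency chain. */
static double ddot_ref(long n, const double *x, const double *y) {
  double s[8] = { 0.0 };
  double dot = 0.0;
  long i;
  int k;
  for (i = 0; i + 8 <= n; i += 8)
    for (k = 0; k < 8; k++)
      s[k] += x[i + k] * y[i + k];
  for (k = 0; k < 8; k++)
    dot += s[k];
  for (; i < n; i++) /* scalar tail */
    dot += x[i] * y[i];
  return dot;
}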
#include "common.h" -static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - FLOAT dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), + [y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - FLOAT dot = 0.0 ; + FLOAT dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) 
) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; - if ( n1 ) - dot = ddot_kernel_16(n1, x, y); + if (n1) + dot = ddot_kernel_16(n1, x, y); - i = n1; - while(i < n) - { - - dot += y[i] * x[i] ; - i++ ; - - } - return(dot); + i = n1; + while (i < n) { + dot += y[i] * x[i]; + i++; } + return (dot); - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; + } - BLASLONG n1 = n & -4; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = y[iy] * x[ix] ; - FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + while (i < n1) { - FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; - FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + FLOAT m1 = y[iy] * x[ix]; + FLOAT m2 = y[iy + inc_y] * x[ix + inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; + FLOAT m3 = y[iy + 2 * inc_y] * x[ix + 2 * inc_x]; + FLOAT m4 = y[iy + 3 * inc_y] * x[ix + 3 * inc_x]; - temp1 += m1+m3; - temp2 += m2+m4; + ix += inc_x * 4; + iy += inc_y * 4; - i+=4 ; + temp1 += m1 + m3; + temp2 += m2 + m4; - } + i += 4; - while(i < n) - { + } - temp1 += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - dot = temp1 + temp2; - return(dot); - -} + temp1 += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + dot = temp1 + temp2; + return (dot); +} diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index ca4fd61709..c93ff9b548 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,663 +29,579 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
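The ddot rewrite above is more than reformatting: the old kernel funneled every vfmadb into the single accumulator %%v0, so each FMA had to wait on the previous one; the new kernel keeps eight independent accumulators %%v0-%%v7 and folds them with vfadb only after the loop, letting consecutive iterations overlap in the FMA pipeline. The dgemv_n kernels below apply the same idea to the y updates (one %%v4 target in the old code, %%v4-%%v7 in the new). A portable-C sketch of the pattern, for illustration only and not part of the patch (the function name is mine):

/* Eight independent partial sums break the FMA dependency chain,
 * mirroring the %%v0..%%v7 accumulators in ddot_kernel_16 above.
 * The vector kernel handles 16 doubles per iteration (8 registers
 * of 2 lanes); this scalar sketch shows the same structure. */
static double ddot_ref_8acc(long n, const double *x, const double *y) {
  double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
  double s4 = 0.0, s5 = 0.0, s6 = 0.0, s7 = 0.0;
  long i;
  for (i = 0; i + 8 <= n; i += 8) {
    s0 += x[i] * y[i];
    s1 += x[i + 1] * y[i + 1];
    s2 += x[i + 2] * y[i + 2];
    s3 += x[i + 3] * y[i + 3];
    s4 += x[i + 4] * y[i + 4];
    s5 += x[i + 5] * y[i + 5];
    s6 += x[i + 6] * y[i + 6];
    s7 += x[i + 7] * y[i + 7];
  }
  for (; i < n; i++)          /* scalar tail; the asm kernel instead
                                 requires n to be a multiple of 16 */
    s0 += x[i] * y[i];
  /* Fold the partials pairwise, as the trailing vfadb sequence does. */
  return ((s0 + s1) + (s2 + s3)) + ((s4 + s5) + (s6 + s7));
}

Note that the reassociated sum rounds slightly differently from a strictly sequential one, which is the usual trade-off accepted in vectorized BLAS reductions.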
#define NBMAX 2048 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%5) \n\t" - "vlrepg %%v1,8(%5) \n\t" - "vlrepg %%v2,16(%5) \n\t" - "vlrepg %%v3,24(%5) \n\t" - "vlrepg %%v4,%7 \n\t" - "vfmdb %%v0,%%v0,%%v4 \n\t" - "vfmdb %%v1,%%v1,%%v4 \n\t" - "vfmdb %%v2,%%v2,%%v4 \n\t" - "vfmdb %%v3,%%v3,%%v4 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - "vl %%v24,32(%%r1,%1) \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vl %%v28,48(%%r1,%1) \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "vl %%v4,32(%%r1,%6) \n\t" - "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,32(%%r1,%6) \n\t" - - "vl %%v4,48(%%r1,%6) \n\t" - "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,48(%%r1,%6) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,64(%%r1,%2) \n\t" - "vl %%v18,64(%%r1,%3) \n\t" - "vl %%v19,64(%%r1,%4) \n\t" - "vl %%v20,80(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,80(%%r1,%3) \n\t" - "vl %%v23,80(%%r1,%4) \n\t" - "vl %%v24,96(%%r1,%1) \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vl %%v28,112(%%r1,%1) \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - - "vl %%v4,64(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,64(%%r1,%6) \n\t" - - "vl %%v4,80(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,80(%%r1,%6) \n\t" - - "vl %%v4,96(%%r1,%6) \n\t" - "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,96(%%r1,%6) \n\t" - - "vl %%v4,112(%%r1,%6) \n\t" - "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,112(%%r1,%6) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) 
\n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,16(%[x])\n\t" + "vlrepg %%v3,24(%[x])\n\t" + "vlrepg %%v4,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v4\n\t" + "vfmdb %%v1,%%v1,%%v4\n\t" + "vfmdb %%v2,%%v2,%%v4\n\t" + "vfmdb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl 
%%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%3) \n\t" - "vlrepg %%v1,8(%3) \n\t" - "vlrepg %%v2,%5 \n\t" - "vfmdb %%v0,%%v0,%%v2 \n\t" - "vfmdb %%v1,%%v1,%%v2 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "vl %%v20,32(%%r1,%1) \n\t" - "vl %%v21,32(%%r1,%2) \n\t" - "vl %%v22,48(%%r1,%1) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vl %%v26,80(%%r1,%1) \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vl %%v28,96(%%r1,%1) \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%1) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "vl %%v2,32(%%r1,%4) \n\t" - "vfmadb 
%%v2,%%v20,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" - "vst %%v2,32(%%r1,%4) \n\t" - - "vl %%v2,48(%%r1,%4) \n\t" - "vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" - "vst %%v2,48(%%r1,%4) \n\t" - - "vl %%v2,64(%%r1,%4) \n\t" - "vfmadb %%v2,%%v24,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v25,%%v1,%%v2 \n\t" - "vst %%v2,64(%%r1,%4) \n\t" - - "vl %%v2,80(%%r1,%4) \n\t" - "vfmadb %%v2,%%v26,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v27,%%v1,%%v2 \n\t" - "vst %%v2,80(%%r1,%4) \n\t" - - "vl %%v2,96(%%r1,%4) \n\t" - "vfmadb %%v2,%%v28,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v29,%%v1,%%v2 \n\t" - "vst %%v2,96(%%r1,%4) \n\t" - - "vl %%v2,112(%%r1,%4) \n\t" - "vfmadb %%v2,%%v30,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v31,%%v1,%%v2 \n\t" - "vst %%v2,112(%%r1,%4) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v2\n\t" + "vfmdb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst 
%%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%2) \n\t" - "vlrepg %%v1,%4 \n\t" - "vfmdb %%v0,%%v0,%%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%1) \n\t" - "vl %%v22,96(%%r1,%1) \n\t" - "vl %%v23,112(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "vl %%v1,32(%%r1,%3) \n\t" - "vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" - "vst %%v1,32(%%r1,%3) \n\t" - - "vl %%v1,48(%%r1,%3) \n\t" - "vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" - "vst %%v1,48(%%r1,%3) \n\t" - - "vl %%v1,64(%%r1,%3) \n\t" - "vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" - "vst %%v1,64(%%r1,%3) \n\t" - - "vl %%v1,80(%%r1,%3) \n\t" - "vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" - "vst %%v1,80(%%r1,%3) \n\t" - - "vl %%v1,96(%%r1,%3) \n\t" - "vfmadb %%v1,%%v22,%%v0,%%v1 \n\t" - "vst %%v1,96(%%r1,%3) \n\t" - - "vl %%v1,112(%%r1,%3) \n\t" - "vfmadb %%v1,%%v23,%%v0,%%v1 \n\t" - "vst %%v1,112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg 
%%v0,0(%[x])\n\t" + "vlrepg %%v16,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" + "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" + "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i]; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8],*ybuffer; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - ybuffer = buffer; - - n1 = n >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*8); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr 
+= 4; - } - - if ( n2 & 2 ) - { - dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; } - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 8); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] 
+= lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } + } + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + + } + + if (m3 == 0) + return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) 
{ + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } - return(0); + return (0); } - - diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 2d8fa0d104..24680cf1b7 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,795 +29,724 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmadb 
%%v2,%%v22,%%v26,%%v2 \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmadb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "adbr %%f0,%%f4 \n\t" - "std %%f0,0(%6) \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "adbr %%f1,%%f4 \n\t" - "std %%f1,8(%6) \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "adbr %%f2,%%f4 \n\t" - "std %%f2,16(%6) \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "adbr %%f3,%%f4 \n\t" - "std %%f3,24(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" + 
"vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v2,%%v2,%%v6\n\t" + "vfadb %%v3,%%v3,%%v7\n\t" + "vrepg %%v4,%%v0,1\n\t" + "adbr %%f0,%%f4\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v4,%%v1,1\n\t" + "adbr %%f1,%%f4\n\t" + "std %%f1,8(%[y])\n\t" + "vrepg %%v4,%%v2,1\n\t" + "adbr %%f2,%%f4\n\t" + "std %%f2,16(%[y])\n\t" + "vrepg %%v4,%%v3,1\n\t" + "adbr %%f3,%%f4\n\t" + "std %%f3,24(%[y])" + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl 
%%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "adbr %%f0,%%f2 \n\t" - "std %%f0,0(%4) \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "adbr %%f1,%%f2 \n\t" - "std %%f1,8(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + 
"vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v1,%%v1,%%v3\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v1,%%v1,%%v7\n\t" + "vrepg %%v2,%%v0,1\n\t" + "adbr %%f0,%%f2\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v2,%%v1,1\n\t" + "adbr %%f1,%%f2\n\t" + "std %%f1,8(%[y])" + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "std %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - 
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "std %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } } - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepg %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmadb 
%%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) \n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepg %%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void 
add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; } + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - yp = ytemp; + y_ptr = y; + a_ptr = a; + x_ptr = x; - for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x4(NB,ap,xbuffer,yp); - ap[0] += lda4 ; - ap[1] += lda4 ; - ap[2] += lda4 ; - ap[3] += lda4 ; - yp += 4; - } - if ( n1 > 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; - } + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); - if ( n2 & 2 ) - { + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; - dgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + if (n0 > 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { + yp = ytemp; + for (i = 0; i < nb1; i++) { + dgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; } + add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += nb1 * inc_y * 4; + a_ptr += nb1 * lda4; - if ( n2 & 1 ) - { - - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; + } - } - a += NB; - x += NB * inc_x; } - if ( m3 == 0 ) return(0); + yp = ytemp; - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - 
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } - if ( inc_y == 1 ) - { + if (n2 & 2) { - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; + dgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - for ( j=0; j< ( n & -4 ); j+=4 ) - { + } - y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; - y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2; - y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2; - y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2; - aj += lda4; - } + if (n2 & 1) { - for ( ; j< n ; j++ ) - { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + // a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + // y_ptr += inc_y; - y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ; - aj += lda; - } + } + a += NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; - } - else - { + FLOAT *aj = a_ptr; + y_ptr = y; - for ( j=0; j 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = dmax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = dmax_kernel_32(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 83e7b02a86..87bccbe55d 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
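/* A note on the trip-count masks used throughout these kernels: an
   expression like n & -32 clears the low five bits of n, i.e. rounds
   it down to a multiple of 32, so the unrolled vector loop covers n1
   elements and plain C mops up the remainder. A minimal sketch of the
   idiom (names here are illustrative, not from the patch): */

#include <assert.h>

static void split_trip_count(long n) {
  long n1 = n & -32;   /* largest multiple of 32 that is <= n */
  long tail = n - n1;  /* 0..31 elements left for the scalar tail */
  assert(n1 % 32 == 0 && tail >= 0 && tail < 32);
}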
#include "common.h" -static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT max; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return max; +static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT max; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb 
%%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return max; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - - if (n <= 0 || inc_x <= 0) return (maxf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = dmax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = dmax_kernel_32(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 073289186e..518cc262ce 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
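/* dmax_z13.c above reduces eight vector registers to one maximum with
   a compare-and-select tournament: vfchdb writes an all-ones lane mask
   where the first operand compares high, and vsel uses that mask to
   keep the larger lane; a final vrepg/wfchdb pass compares the two
   lanes of the surviving register. The same reduction in portable C,
   with two doubles standing in for one vector register (illustrative
   sketch only): */

typedef struct { double d[2]; } v2df;

static v2df vmax2(v2df a, v2df b) {
  v2df r;
  for (int i = 0; i < 2; i++)          /* per-lane compare + select */
    r.d[i] = a.d[i] > b.d[i] ? a.d[i] : b.d[i];
  return r;
}

static double reduce_max8(v2df v[8]) {
  v2df m = vmax2(vmax2(vmax2(v[0], v[1]), vmax2(v[2], v[3])),
                 vmax2(vmax2(v[4], v[5]), vmax2(v[6], v[7])));
  return m.d[0] > m.d[1] ? m.d[0] : m.d[1];   /* cross-lane step */
}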
#include "common.h" -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmindb %%v16,%%v16,%%v24,0 \n\t" - "vfmindb %%v17,%%v17,%%v25,0 \n\t" - "vfmindb %%v18,%%v18,%%v26,0 \n\t" - "vfmindb %%v19,%%v19,%%v27,0 \n\t" - "vfmindb %%v20,%%v20,%%v28,0 \n\t" - "vfmindb %%v21,%%v21,%%v29,0 \n\t" - "vfmindb %%v22,%%v22,%%v30,0 \n\t" - "vfmindb %%v23,%%v23,%%v31,0 \n\t" - - "vfmindb %%v16,%%v16,%%v20,0 \n\t" - "vfmindb %%v17,%%v17,%%v21,0 \n\t" - "vfmindb %%v18,%%v18,%%v22,0 \n\t" - "vfmindb %%v19,%%v19,%%v23,0 \n\t" - - "vfmindb %%v16,%%v16,%%v18,0 \n\t" - "vfmindb %%v17,%%v17,%%v19,0 \n\t" - - "vfmindb %%v16,%%v16,%%v17,0 \n\t" - - "vfmindb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v17,%%v17,%%v25,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v19,%%v19,%%v27,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v21,%%v21,%%v29,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v23,%%v23,%%v31,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v17,%%v17,%%v21,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v19,%%v19,%%v23,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v17,%%v17,%%v19,0\n\t" + "vfmindb %%v16,%%v16,%%v17,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; 
- BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = dmin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = dmin_kernel_32(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index e64f90ee38..91561992f5 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
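/* dmin.c above leans on the z14 vector minimum (vfmindb/wfmindb), one
   instruction per lane-wise min, while dmin_z13.c below emulates the
   same operation with compare (vfchdb) plus select (vsel). The
   lane-wise semantics, sketched in C under the assumption of ordinary
   ordered compares (the real instruction's NaN modes are not modeled
   here): */

static void vmin2(const double a[2], const double b[2], double r[2]) {
  for (int i = 0; i < 2; i++)
    r[i] = a[i] < b[i] ? a[i] : b[i];
}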
#include "common.h" -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb 
%%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = dmin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = dmin_kernel_32(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index c91f958005..8f0197f023 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
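/* The drot kernel below applies the plane rotation
 *
 *   x' = c*x + s*y
 *   y' = c*y - s*x
 *
 * to 32 elements per iteration: c*x and s*x are formed first (vfmdb),
 * then the y terms are folded in with fused multiply-add/subtract
 * (vfmadb/vfmsdb). Scalar reference for the same update: */

static void drot_ref(long n, double *x, double *y, double c, double s) {
  for (long i = 0; i < n; i++) {
    double t = c * x[i] + s * y[i];
    y[i] = c * y[i] - s * x[i];
    x[i] = t;
  }
}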
#include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepg %%v0,%3 \n\t" - "vlrepg %%v1,%4 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" 
/* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl 
%%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] 
"a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT temp; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - drot_kernel_32(n1, x, y, &cosa, &sina); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - - i++ ; + BLASLONG n1 = n & -32; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + drot_kernel_32(n1, x, y, &cosa, &sina); + i = n1; + } - } + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; + } else { - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; - } + ix += inc_x; + iy += inc_y; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index ccc6dd95d2..c944990b5a 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,179 +27,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) -{ - __asm__ volatile ( - "vlrepg %%v0,%1 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%2) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vl %%v24, 64(%%r1,%2) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 64(%%r1,%2) \n\t" - "vl %%v25, 80(%%r1,%2) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vl %%v26, 96(%%r1,%2) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vl %%v27, 112(%%r1,%2) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v24","v25","v26","v27" - ); +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepg %%v0,%[da]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmdb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmdb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmdb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmdb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmdb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmdb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmdb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmdb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x),[da] "m"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG 
inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0,j=0; - if ( n <= 0 || inc_x <=0 ) - return(0); - - - if ( inc_x == 1 ) - { - - if ( da == 0.0 ) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - - dscal_kernel_16_zero(n1, x); - j=n1; - } - - while(j < n) - { - - x[j]=0.0; - j++; - } - - } - else - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - dscal_kernel_16(n1, da, x); - j=n1; - } - while(j < n) - { - - x[j] = da * x[j] ; - j++; - } - } +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); + if (inc_x == 1) { + + if (da == 0.0) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + dscal_kernel_16_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + + } else { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + dscal_kernel_16(n1, da, x); + j = n1; + } + while (j < n) { + + x[j] = da * x[j]; + j++; + } } - else - { - if ( da == 0.0 ) - { + } else { - BLASLONG n1 = n & -4; + if (da == 0.0) { - while (j < n1) { + BLASLONG n1 = n & -4; - x[i]=0.0; - x[i + inc_x]=0.0; - x[i + 2 * inc_x]=0.0; - x[i + 3 * inc_x]=0.0; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = 0.0; + x[i + inc_x] = 0.0; + x[i + 2 * inc_x] = 0.0; + x[i + 3 * inc_x] = 0.0; - } - while(j < n) - { + i += inc_x * 4; + j += 4; - x[i]=0.0; - i += inc_x ; - j++; - } + } + while (j < n) { - } - else - { - BLASLONG n1 = n & -4; + x[i] = 0.0; + i += inc_x; + j++; + } - while (j < n1) { + } else { + BLASLONG n1 = n & -4; - x[i] = da * x[i] ; - x[i + inc_x] = da * x[i + inc_x]; - x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; - x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; + x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; + x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; - } + i += inc_x * 4; + j += 4; - while(j < n) - { + } - x[i] = da * x[i] ; - i += inc_x ; - j++; - } - } + while (j < n) { + x[i] = da * x[i]; + i += inc_x; + j++; + } } - return 0; - -} + } + return 0; +} diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 72950c9f44..1ac02d4b93 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018,The OpenBLAS Project +Copyright (c) 2013-2019,The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms,with or without modification,are permitted provided that the following conditions are @@ -27,144 +27,146 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
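/* The dsdot rewrite that follows widens one running sum into eight
   independent accumulators (v0..v7), so consecutive fused
   multiply-adds no longer serialize on a single register; the partial
   sums are folded together with vfadb after the loop. The same trick
   in scalar C, with the float inputs promoted to double as dsdot
   requires (four chains here instead of eight, same idea): */

static double dsdot_ref(long n, const float *x, const float *y) {
  double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
  long i;
  for (i = 0; i + 4 <= n; i += 4) {
    s0 += (double) x[i]     * (double) y[i];
    s1 += (double) x[i + 1] * (double) y[i + 1];
    s2 += (double) x[i + 2] * (double) y[i + 2];
    s3 += (double) x[i + 3] * (double) y[i + 3];
  }
  for (; i < n; i++)                     /* scalar tail */
    s0 += (double) x[i] * (double) y[i];
  return (s0 + s1) + (s2 + s3);
}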
#include "common.h" -static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - double dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v16,4(%%r1,%2),2 \n\t" - "vlef %%v17,8(%%r1,%2),0 \n\t" - "vlef %%v17,12(%%r1,%2),2 \n\t" - "vlef %%v18,16(%%r1,%2),0 \n\t" - "vlef %%v18,20(%%r1,%2),2 \n\t" - "vlef %%v19,24(%%r1,%2),0 \n\t" - "vlef %%v19,28(%%r1,%2),2 \n\t" - "vlef %%v20,32(%%r1,%2),0 \n\t" - "vlef %%v20,36(%%r1,%2),2 \n\t" - "vlef %%v21,40(%%r1,%2),0 \n\t" - "vlef %%v21,44(%%r1,%2),2 \n\t" - "vlef %%v22,48(%%r1,%2),0 \n\t" - "vlef %%v22,52(%%r1,%2),2 \n\t" - "vlef %%v23,56(%%r1,%2),0 \n\t" - "vlef %%v23,60(%%r1,%2),2 \n\t" - - "vflls %%v16,%%v16 \n\t" - "vflls %%v17,%%v17 \n\t" - "vflls %%v18,%%v18 \n\t" - "vflls %%v19,%%v19 \n\t" - "vflls %%v20,%%v20 \n\t" - "vflls %%v21,%%v21 \n\t" - "vflls %%v22,%%v22 \n\t" - "vflls %%v23,%%v23 \n\t" - - "vlef %%v24,0(%%r1,%3),0 \n\t" - "vlef %%v24,4(%%r1,%3),2 \n\t" - "vflls %%v24,%%v24 \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vlef %%v25,8(%%r1,%3),0 \n\t" - "vlef %%v25,12(%%r1,%3),2 \n\t" - "vflls %%v25,%%v25 \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - "vlef %%v26,16(%%r1,%3),0 \n\t" - "vlef %%v26,20(%%r1,%3),2 \n\t" - "vflls %%v26,%%v26 \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - "vlef %%v27,24(%%r1,%3),0 \n\t" - "vlef %%v27,28(%%r1,%3),2 \n\t" - "vflls %%v27,%%v27 \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - "vlef %%v28,32(%%r1,%3),0 \n\t" - "vlef %%v28,36(%%r1,%3),2 \n\t" - "vflls %%v28,%%v28 \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - "vlef %%v29,40(%%r1,%3),0 \n\t" - "vlef %%v29,44(%%r1,%3),2 \n\t" - "vflls %%v29,%%v29 \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - "vlef %%v30,48(%%r1,%3),0 \n\t" - "vlef %%v30,52(%%r1,%3),2 \n\t" - "vflls %%v30,%%v30 \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - "vlef %%v31,56(%%r1,%3),0 \n\t" - "vlef %%v31,60(%%r1,%3),2 \n\t" - "vflls %%v31,%%v31 \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,64 \n\t" - "brctg %%r0,0b \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + double dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vlef %%v16,0(%%r1,%[x]),0\n\t" + "vlef %%v16,4(%%r1,%[x]),2\n\t" + "vlef %%v17,8(%%r1,%[x]),0\n\t" + "vlef %%v17,12(%%r1,%[x]),2\n\t" + "vlef %%v18,16(%%r1,%[x]),0\n\t" + "vlef %%v18,20(%%r1,%[x]),2\n\t" + "vlef %%v19,24(%%r1,%[x]),0\n\t" + "vlef %%v19,28(%%r1,%[x]),2\n\t" + "vlef %%v20,32(%%r1,%[x]),0\n\t" + "vlef %%v20,36(%%r1,%[x]),2\n\t" + "vlef %%v21,40(%%r1,%[x]),0\n\t" + "vlef %%v21,44(%%r1,%[x]),2\n\t" + "vlef %%v22,48(%%r1,%[x]),0\n\t" + "vlef %%v22,52(%%r1,%[x]),2\n\t" + "vlef %%v23,56(%%r1,%[x]),0\n\t" + "vlef %%v23,60(%%r1,%[x]),2\n\t" + "vflls %%v16,%%v16\n\t" + "vflls %%v17,%%v17\n\t" + "vflls %%v18,%%v18\n\t" + "vflls %%v19,%%v19\n\t" + "vflls %%v20,%%v20\n\t" + "vflls %%v21,%%v21\n\t" + "vflls %%v22,%%v22\n\t" + "vflls 
%%v23,%%v23\n\t" + "vlef %%v24,0(%%r1,%[y]),0\n\t" + "vlef %%v24,4(%%r1,%[y]),2\n\t" + "vflls %%v24,%%v24\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vlef %%v25,8(%%r1,%[y]),0\n\t" + "vlef %%v25,12(%%r1,%[y]),2\n\t" + "vflls %%v25,%%v25\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vlef %%v26,16(%%r1,%[y]),0\n\t" + "vlef %%v26,20(%%r1,%[y]),2\n\t" + "vflls %%v26,%%v26\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vlef %%v27,24(%%r1,%[y]),0\n\t" + "vlef %%v27,28(%%r1,%[y]),2\n\t" + "vflls %%v27,%%v27\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vlef %%v28,32(%%r1,%[y]),0\n\t" + "vlef %%v28,36(%%r1,%[y]),2\n\t" + "vflls %%v28,%%v28\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vlef %%v29,40(%%r1,%[y]),0\n\t" + "vlef %%v29,44(%%r1,%[y]),2\n\t" + "vflls %%v29,%%v29\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vlef %%v30,48(%%r1,%[y]),0\n\t" + "vlef %%v30,52(%%r1,%[y]),2\n\t" + "vflls %%v30,%%v30\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vlef %%v31,56(%%r1,%[y]),0\n\t" + "vlef %%v31,60(%%r1,%[y]),2\n\t" + "vflls %%v31,%%v31\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - double dot = 0.0 ; + double dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; - if ( n1 ) - dot = dsdot_kernel_16(n1,x,y); + if (n1) + dot = dsdot_kernel_16(n1, x, y); - i = n1; - while(i < n) - { + i = n1; + while (i < n) { - dot += (double) y[i] * (double) x[i] ; - i++ ; + dot += (double) y[i] * (double) x[i]; + i++; - } - return(dot); + } + return (dot); + } - } + BLASLONG n1 = n & -2; - BLASLONG n1 = n & -2; + while (i < n1) { - while(i < n1) - { + dot += (double) y[iy] * (double) x[ix]; + dot += (double) y[iy + inc_y] * (double) x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; - dot += (double) y[iy] * (double) x[ix]; - dot += (double) y[iy+inc_y] * (double) x[ix+inc_x]; - ix += inc_x*2 ; - iy += inc_y*2 ; - i+=2 ; + } - } + while (i < n) { - while(i < n) - { + dot += (double) y[iy] * (double) x[ix]; + ix += inc_x; + iy += inc_y; + i++; - dot += (double) y[iy] * (double) x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); + } + return (dot); } - - diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 8070ef41ac..60ba40bd62 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,136 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + 
"vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1 )) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - dswap_kernel_32(n1, x, y); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = y[i]; - y[i] = x[i] ; - x[i] = temp; - i++ ; - - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dswap_kernel_32(n1, x, y); + i = n1; + } + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = y[iy]; - y[iy] = x[ix] ; - x[ix] = temp; - ix += inc_x ; - iy += inc_y ; - i++ ; + } else { - } + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; } - return(0); - + + } + return (0); } diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 5129ca6ee3..1e1040a6e2 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) -static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - __asm__ volatile ( - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v0,8(%3),1 \n\t" - "vlef %%v1,12(%3),1 \n\t" - "vlef %%v0,16(%3),2 \n\t" - "vlef %%v1,20(%3),2 \n\t" - "vlef %%v0,24(%3),3 \n\t" - "vlef %%v1,28(%3),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v1,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,16 \n\t" - "vzero %%v4 \n\t" - "vleib %%v9,0,0 \n\t" - "vleib %%v9,1,1 \n\t" - "vleib %%v9,2,2 \n\t" - "vleib %%v9,3,3 \n\t" - "vleib %%v9,8,4 \n\t" - "vleib %%v9,9,5 \n\t" - "vleib %%v9,10,6 \n\t" - "vleib %%v9,11,7 \n\t" - "vleib %%v9,16,8 \n\t" - "vleib %%v9,17,9 \n\t" - "vleib %%v9,18,10 \n\t" - "vleib %%v9,19,11 \n\t" - "vleib %%v9,24,12 \n\t" - "vleib %%v9,25,13 \n\t" - "vleib %%v9,26,14 \n\t" - "vleib %%v9,27,15 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" +static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v28,16(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v29,48(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v28,144(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v29,176(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v30,208(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v31,240(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + return iamax; +} - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" + if (n <= 0 || inc_x <= 0) + return (max); - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" + if (inc_x == 1) { - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 
\n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + BLASLONG n1 = n & -32; + if (n1 > 0) { - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + max = icamax_kernel_32(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } - return iamax; -} + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (max + 1); -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; + } else { - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { + max = 0; + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - BLASLONG n1 = n & -32; - if (n1 > 0) { + BLASLONG n1 = n & -4; + while (i < n1) { - max = icamax_kernel_32(n1, x, &maxf); - ix = n1 * 2; - i = n1; + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); } - else - { - maxf = CABS1(x,0); - ix += 2; - i++; - } - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); + ix += inc_x2 * 4; - } else { - - max = 0; - maxf = CABS1(x,0); - inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; + i += 4; - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; } - return (max + 1); + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (max + 1); + } } - - diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 05068b212c..d1c0e32a1e 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) -static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - __asm__ volatile ( - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v0,8(%3),1 \n\t" - "vlef %%v1,12(%3),1 \n\t" - "vlef %%v0,16(%3),2 \n\t" - "vlef %%v1,20(%3),2 \n\t" - "vlef %%v0,24(%3),3 \n\t" - "vlef %%v1,28(%3),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v1,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,16 \n\t" - "vzero %%v4 \n\t" - "vleib %%v9,0,0 \n\t" - "vleib %%v9,1,1 \n\t" - "vleib %%v9,2,2 \n\t" - "vleib %%v9,3,3 \n\t" - "vleib %%v9,8,4 \n\t" - "vleib %%v9,9,5 \n\t" - "vleib %%v9,10,6 \n\t" - "vleib %%v9,11,7 \n\t" - "vleib %%v9,16,8 \n\t" - "vleib %%v9,17,9 \n\t" - "vleib %%v9,18,10 \n\t" - "vleib %%v9,19,11 \n\t" - "vleib %%v9,24,12 \n\t" - "vleib %%v9,25,13 \n\t" - "vleib %%v9,26,14 \n\t" - "vleib %%v9,27,15 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" +static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v28,16(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v29,48(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v28,144(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v29,176(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v30,208(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v31,240(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + return iamin; +} - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" + if (n <= 0 || inc_x <= 0) + return (min); - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" + if (inc_x == 1) { - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 
\n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + BLASLONG n1 = n & -32; + if (n1 > 0) { - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + min = icamin_kernel_32(n1, x, &minf); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } - return iamin; -} + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (min + 1); -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; + } else { - if (n <= 0 || inc_x <= 0) return(min); - - if (inc_x == 1) { + min = 0; + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - BLASLONG n1 = n & -32; - if (n1 > 0) { + BLASLONG n1 = n & -4; + while (i < n1) { - min = icamin_kernel_32(n1, x, &minf); - ix = n1 * 2; - i = n1; + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); } - else - { - minf = CABS1(x,0); - ix += 2; - i++; - } - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); + ix += inc_x2 * 4; - } else { - - min = 0; - minf = CABS1(x,0); - inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; + i += 4; - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; } - return (min + 1); + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (min + 1); + } } - - diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index e5a1d3a7cc..8434c811f4 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel 
%%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamax; +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + 
"vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = idamax_kernel_32(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + max = idamax_kernel_32(n1, x, &maxf); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } - max = 0; - maxf = ABS(x[0]); + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - BLASLONG n1 = n & -4; - while (j < n1) { + } else { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } + max = 0; + maxf = ABS(x[0]); - i += inc_x * 4; + BLASLONG n1 = n & -4; + while (j < n1) { - j += 4; + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } - } + i += inc_x * 4; + j += 4; + + } - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index a68f7282f8..80a37e6c25 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include <math.h> -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel 
%%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamin; +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb 
%%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - - if (n <= 0 || inc_x <= 0) return (min); - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = idamin_kernel_32(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + min = idamin_kernel_32(n1, x, &minf); + i = n1; } else { + minf = ABS(x[0]); + i++; + } - min = 0; - minf = ABS(x[0]); + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); - BLASLONG n1 = n & -4; - while (j < n1) { + } else { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + min = 0; + minf = ABS(x[0]); - i += inc_x * 4; + BLASLONG n1 = n & -4; + while (j < n1) { - j += 4; + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } - } + i += inc_x * 4; + j += 4; + + } - while (j < n) { 
- if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 4c3040779c..18cdba4376 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) -{ - BLASLONG imax; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel 
%%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imax),"=m"(*max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imax; +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb 
%%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imax; } - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) + return (max); - if (n <= 0 || inc_x <= 0) return (max); + if (inc_x == 1) { - if (inc_x == 1) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - BLASLONG n1 = n & -32; - if (n1 > 0) { + max = idmax_kernel_32(n1, x, &maxf); - max = idmax_kernel_32(n1, x, &maxf); + i = n1; + } else { + maxf = x[0]; + i++; + } - i = n1; - } - else - { - maxf = x[0]; - i++; - } + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); + } else { - } else { + max = 0; + maxf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } - max = 0; - maxf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ba1776a49c..02ca427e47 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) -{ - BLASLONG imin; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" 
- "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imin),"=m"(*min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imin; +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[min],0\n\t" + "vmnlg 
%%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imin; } - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) + return (min); - if (n <= 0 || inc_x <= 0) return (min); + if (inc_x == 1) { - if (inc_x == 1) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - BLASLONG n1 = n & -32; - if (n1 > 0) { + min = idmin_kernel_32(n1, x, &minf); - min = idmin_kernel_32(n1, x, &minf); + i = n1; + } else { + minf = x[0]; + i++; + } - i = n1; - } - else - { - minf = x[0]; - i++; - } + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); + } else { - } else { + min = 0; + minf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } - min = 0; - minf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 6e0aaa162d..bbb4012aae 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
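/*
 * Reference semantics for the isamax kernel below (a minimal editorial
 * sketch, not part of the patch; isamax_ref is an illustrative name, and
 * long/float stand in for BLASLONG/FLOAT): over a length that is a
 * multiple of 64, isamax_kernel_64 returns the 0-based index of the first
 * element with the largest absolute value and stores that absolute value
 * through *amax; CNAME adds 1 for the BLAS 1-based convention. The scalar
 * fallback in CNAME follows the same rule.
 */
#include <math.h>
static long isamax_ref(long n, const float *x, float *amax) {
  long imax = 0;
  float best = fabsf(x[0]);
  for (long i = 1; i < n; i++) {
    float a = fabsf(x[i]);
    if (a > best) {      /* strict '>' keeps the first maximal index */
      best = a;
      imax = i;
    }
  }
  *amax = best;
  return imax;
}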
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel 
%%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb 
%%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - 
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamax; + return iamax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = isamax_kernel_64(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + max = isamax_kernel_64(n1, x, &maxf); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - max = 0; - maxf = ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + max = 0; + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 266c48f7ff..e8b34b934a 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel 
%%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb 
%%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - 
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamin; + return iamin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - if (n <= 0 || inc_x <= 0) return (min); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = isamin_kernel_64(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + min = isamin_kernel_64(n1, x, &minf); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); - min = 0; - minf = ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + min = 0; + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index c968ce6fa8..a565df5031 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) -{ - BLASLONG imax; +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf 
%%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag 
%%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imax),"=m"(*max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imax; + return imax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = ismax_kernel_64(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); + max = ismax_kernel_64(n1, x, &maxf); + i = n1; } else { + maxf = x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); - max = 0; - maxf = x[0]; + } else { 
- BLASLONG n1 = n & -4; - while (j < n1) { + max = 0; + maxf = x[0]; - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 0145b31b31..ff72b2c641 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) -{ - BLASLONG imin; +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + 
"vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + 
"vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 
\n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imin),"=m"(*min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imin; + return imin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - if (n <= 0 || inc_x <= 0) return (min); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = ismin_kernel_64(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); + min = ismin_kernel_64(n1, x, &minf); + i = n1; } else { + minf = x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); - min = 0; - minf = x[0]; + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + min = 0; + minf = x[0]; - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2d1cc23653..48afb8215b 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; - - __asm__ volatile ( - "vleg %%v0,0(%3),0 \n\t" - "vleg %%v1,8(%3),0 \n\t" - "vleg %%v0,16(%3),1 \n\t" - "vleg %%v1,24(%3),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v1,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,8 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "srlg %%r0,%2,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vleg %%v16,0(%%r1,%3),0 \n\t" - "vleg %%v17,8(%%r1,%3),0 \n\t" - "vleg %%v16,16(%%r1,%3),1 \n\t" - "vleg %%v17,24(%%r1,%3),1 \n\t" - "vleg %%v18,32(%%r1,%3),0 \n\t" - "vleg %%v19,40(%%r1,%3),0 \n\t" - "vleg %%v18,48(%%r1,%3),1 \n\t" - "vleg %%v19,56(%%r1,%3),1 \n\t" - "vleg %%v20,64(%%r1,%3),0 \n\t" - "vleg %%v21,72(%%r1,%3),0 \n\t" - "vleg %%v20,80(%%r1,%3),1 \n\t" - "vleg %%v21,88(%%r1,%3),1 \n\t" - "vleg %%v22,96(%%r1,%3),0 \n\t" - "vleg %%v23,104(%%r1,%3),0 \n\t" - "vleg %%v22,112(%%r1,%3),1 \n\t" - "vleg %%v23,120(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vleg %%v16,128(%%r1,%3),0 \n\t" - "vleg %%v17,136(%%r1,%3),0 \n\t" - "vleg %%v16,144(%%r1,%3),1 \n\t" - "vleg %%v17,152(%%r1,%3),1 \n\t" - "vleg %%v18,160(%%r1,%3),0 \n\t" - "vleg %%v19,168(%%r1,%3),0 \n\t" - "vleg %%v18,176(%%r1,%3),1 \n\t" - "vleg %%v19,184(%%r1,%3),1 \n\t" - "vleg %%v20,192(%%r1,%3),0 \n\t" - "vleg %%v21,200(%%r1,%3),0 \n\t" - "vleg %%v20,208(%%r1,%3),1 \n\t" - "vleg %%v21,216(%%r1,%3),1 \n\t" - "vleg %%v22,224(%%r1,%3),0 \n\t" - "vleg %%v23,232(%%r1,%3),0 \n\t" - "vleg %%v22,240(%%r1,%3),1 \n\t" - "vleg %%v23,248(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - 
"vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return iamax; + +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb 
%%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamax; } -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (max); + + if (inc_x == 1) { - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { + BLASLONG n1 = n & -16; + if (n1 > 0) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + max = izamax_kernel_16(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } - max = izamax_kernel_16(n1, x, &maxf); - ix = n1 * 2; - i = n1; + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); } - else - { - maxf = CABS1(x,0); - ix += 2; - i++; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; + ix += 2; + i++; } - return (max + 1); + return (max + 1); + + } else { - } else { - max = 0; - maxf = CABS1(x,0); + maxf = CABS1(x, 0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + } - return (max + 1); + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (max + 1); + } } - - diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 676fd7c6d9..3edbe3d58c 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; - - __asm__ volatile ( - "vleg %%v0,0(%3),0 \n\t" - "vleg %%v1,8(%3),0 \n\t" - "vleg %%v0,16(%3),1 \n\t" - "vleg %%v1,24(%3),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v1,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,8 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "srlg %%r0,%2,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vleg %%v16,0(%%r1,%3),0 \n\t" - "vleg %%v17,8(%%r1,%3),0 \n\t" - "vleg %%v16,16(%%r1,%3),1 \n\t" - "vleg %%v17,24(%%r1,%3),1 \n\t" - "vleg %%v18,32(%%r1,%3),0 \n\t" - "vleg %%v19,40(%%r1,%3),0 \n\t" - "vleg %%v18,48(%%r1,%3),1 \n\t" - "vleg %%v19,56(%%r1,%3),1 \n\t" - "vleg %%v20,64(%%r1,%3),0 \n\t" - "vleg %%v21,72(%%r1,%3),0 \n\t" - "vleg %%v20,80(%%r1,%3),1 \n\t" - "vleg %%v21,88(%%r1,%3),1 \n\t" - "vleg %%v22,96(%%r1,%3),0 \n\t" - "vleg %%v23,104(%%r1,%3),0 \n\t" - "vleg %%v22,112(%%r1,%3),1 \n\t" - "vleg %%v23,120(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vleg %%v16,128(%%r1,%3),0 \n\t" - "vleg %%v17,136(%%r1,%3),0 \n\t" - "vleg %%v16,144(%%r1,%3),1 \n\t" - "vleg %%v17,152(%%r1,%3),1 \n\t" - "vleg %%v18,160(%%r1,%3),0 \n\t" - "vleg %%v19,168(%%r1,%3),0 \n\t" - "vleg %%v18,176(%%r1,%3),1 \n\t" - "vleg %%v19,184(%%r1,%3),1 \n\t" - "vleg %%v20,192(%%r1,%3),0 \n\t" - "vleg %%v21,200(%%r1,%3),0 \n\t" - "vleg %%v20,208(%%r1,%3),1 \n\t" - "vleg %%v21,216(%%r1,%3),1 \n\t" - "vleg %%v22,224(%%r1,%3),0 \n\t" - "vleg %%v23,232(%%r1,%3),0 \n\t" - "vleg %%v22,240(%%r1,%3),1 \n\t" - "vleg %%v23,248(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel 
%%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return iamin; + +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg 
%%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamin; } -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (min); + + if (inc_x == 1) { - if (n <= 0 || inc_x <= 0) return(min); - - if (inc_x == 1) { + BLASLONG n1 = n & -16; + if (n1 > 0) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + min = izamin_kernel_16(n1, x, &minf); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } - min = izamin_kernel_16(n1, x, &minf); - ix = n1 * 2; - i = n1; + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); } - else - { - minf = CABS1(x,0); - ix += 2; - i++; - } - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; + ix += 2; + i++; } - return (min + 1); + return (min + 1); + + } else { - } else { - min = 0; - minf = CABS1(x,0); + minf = CABS1(x, 0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + } - return (min + 1); + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (min + 1); + } } - - diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index b629d64c06..efbc0318c8 100644 --- a/kernel/zarch/samax.c 
+++ b/kernel/zarch/samax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif - -static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmaxsb %%v16,%%v16,%%v24,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v25,8 \n\t" - "vfmaxsb %%v18,%%v18,%%v26,8 \n\t" - "vfmaxsb %%v19,%%v19,%%v27,8 \n\t" - "vfmaxsb %%v20,%%v20,%%v28,8 \n\t" - "vfmaxsb %%v21,%%v21,%%v29,8 \n\t" - "vfmaxsb %%v22,%%v22,%%v30,8 \n\t" - "vfmaxsb %%v23,%%v23,%%v31,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v20,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v21,8 \n\t" - "vfmaxsb %%v18,%%v18,%%v22,8 \n\t" - "vfmaxsb %%v19,%%v19,%%v23,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v18,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v19,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" - - "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfmaxsb %%v0,%%v0,%%v16,8 \n\t" - "lper %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,8\n\t" + "vfmaxsb %%v17,%%v17,%%v25,8\n\t" + "vfmaxsb %%v18,%%v18,%%v26,8\n\t" + "vfmaxsb %%v19,%%v19,%%v27,8\n\t" + "vfmaxsb %%v20,%%v20,%%v28,8\n\t" + "vfmaxsb %%v21,%%v21,%%v29,8\n\t" + "vfmaxsb %%v22,%%v22,%%v30,8\n\t" + "vfmaxsb %%v23,%%v23,%%v31,8\n\t" + "vfmaxsb %%v16,%%v16,%%v20,8\n\t" + "vfmaxsb %%v17,%%v17,%%v21,8\n\t" + "vfmaxsb %%v18,%%v18,%%v22,8\n\t" + "vfmaxsb %%v19,%%v19,%%v23,8\n\t" + 
"vfmaxsb %%v16,%%v16,%%v18,8\n\t" + "vfmaxsb %%v17,%%v17,%%v19,8\n\t" + "vfmaxsb %%v16,%%v16,%%v17,8\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = samax_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = samax_kernel_64(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 7ce6ee657c..138836ce57 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif - -static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfminsb %%v16,%%v16,%%v24,8 \n\t" - "vfminsb %%v17,%%v17,%%v25,8 \n\t" - "vfminsb %%v18,%%v18,%%v26,8 \n\t" - "vfminsb %%v19,%%v19,%%v27,8 \n\t" - "vfminsb %%v20,%%v20,%%v28,8 \n\t" - "vfminsb %%v21,%%v21,%%v29,8 \n\t" - "vfminsb %%v22,%%v22,%%v30,8 \n\t" - "vfminsb %%v23,%%v23,%%v31,8 \n\t" - - "vfminsb %%v16,%%v16,%%v20,8 \n\t" - "vfminsb %%v17,%%v17,%%v21,8 \n\t" - "vfminsb %%v18,%%v18,%%v22,8 \n\t" - "vfminsb %%v19,%%v19,%%v23,8 \n\t" - - "vfminsb %%v16,%%v16,%%v18,8 \n\t" - "vfminsb %%v17,%%v17,%%v19,8 \n\t" - - "vfminsb %%v16,%%v16,%%v17,8 \n\t" - - "vfminsb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,8 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,8 \n\t" - "lper %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,8\n\t" + "vfminsb %%v17,%%v17,%%v25,8\n\t" + "vfminsb %%v18,%%v18,%%v26,8\n\t" + "vfminsb %%v19,%%v19,%%v27,8\n\t" + "vfminsb %%v20,%%v20,%%v28,8\n\t" + "vfminsb %%v21,%%v21,%%v29,8\n\t" + "vfminsb %%v22,%%v22,%%v30,8\n\t" + "vfminsb %%v23,%%v23,%%v31,8\n\t" + "vfminsb %%v16,%%v16,%%v20,8\n\t" + "vfminsb %%v17,%%v17,%%v21,8\n\t" + "vfminsb %%v18,%%v18,%%v22,8\n\t" + "vfminsb %%v19,%%v19,%%v23,8\n\t" + "vfminsb %%v16,%%v16,%%v18,8\n\t" + "vfminsb %%v17,%%v17,%%v19,8\n\t" + "vfminsb %%v16,%%v16,%%v17,8\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = samin_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = samin_kernel_64(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c index 2c59ab2e5f..0c3057a929 100644 --- a/kernel/zarch/sasum.c +++ b/kernel/zarch/sasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,147 +28,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vfasb %%v0,%%v0,%%v3 \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "ler %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +#define ABS fabsf + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl 
%%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - if (n <= 0 || inc_x <= 0) return sumf; - - if (inc_x == 1) { - - n1 = n & -64; - - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return sumf; - sumf = sasum_kernel_64(n1, x); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - sumf += ABS(x[i]); - i++; - } + n1 = n & -64; - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + if (n1 > 0) { - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + sumf = sasum_kernel_64(n1, x); + i = n1; + } - i += inc_x * 4; - j += 4; + while (i < n) { + sumf += ABS(x[i]); + i++; + } - } - sumf = sum1 + sum2; - while (j < n) { + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - sumf += ABS(x[i]); - i += inc_x; - j++; - } + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; } - return sumf; -} + sumf = sum1 + sum2; + while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } + } + return sumf; +} diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c index 26ead310cb..e41e87af07 100644 --- a/kernel/zarch/saxpy.c +++ b/kernel/zarch/saxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
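The saxpy.c hunk below broadcasts alpha once with vlrepf and then issues fused multiply-adds (vfmasb), each of which updates four floats of y per instruction. What the vector loop computes, in scalar form (saxpy_ref is an illustrative name):

/* y = alpha * x + y, the AXPY update the kernel vectorizes. */
static void saxpy_ref(long n, float da, const float *x, float *y) {
    for (long i = 0; i < n; i++)
        y[i] += da * x[i];
}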
#include "common.h" -static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( - "vlrepf %%v0,%3 \n\t" - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - - "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,80(%%r1,%1) \n\t" - "vl %%v26,96(%%r1,%1) \n\t" - "vl %%v27,112(%%r1,%1) \n\t" - "vl %%v28,64(%%r1,%2) \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vl %%v30,96(%%r1,%2) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "vl %%v16,128(%%r1,%1) \n\t" - "vl %%v17,144(%%r1,%1) \n\t" - "vl %%v18,160(%%r1,%1) \n\t" - "vl %%v19,176(%%r1,%1) \n\t" - "vl %%v20,128(%%r1,%2) \n\t" - "vl %%v21,144(%%r1,%2) \n\t" - "vl %%v22,160(%%r1,%2) \n\t" - "vl %%v23,176(%%r1,%2) \n\t" - - "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,192(%%r1,%1) \n\t" - "vl %%v25,208(%%r1,%1) \n\t" - "vl %%v26,224(%%r1,%1) \n\t" - "vl %%v27,240(%%r1,%1) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,128(%%r1,%2) \n\t" - "vst %%v17,144(%%r1,%2) \n\t" - "vst %%v18,160(%%r1,%2) \n\t" - "vst %%v19,176(%%r1,%2) \n\t" - "vst %%v20,192(%%r1,%2) \n\t" - "vst %%v21,208(%%r1,%2) \n\t" - "vst %%v22,224(%%r1,%2) \n\t" - "vst %%v23,240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__("vlrepf %%v0,%[alpha]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb 
%%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + [alpha] "m"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return 0 ; + if (n <= 0) + return 0; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -64; + BLASLONG n1 = n & -64; - if ( n1 ) - saxpy_kernel_64(n1, x, y , &da); + if (n1) + saxpy_kernel_64(n1, x, y, &da); - i = n1; - while(i < n) - { - - y[i] += da * x[i] ; - i++ ; - - } - return 0 ; + i = n1; + while (i < n) { + y[i] += da * x[i]; + i++; } + return 0; - BLASLONG n1 = n & -4; + } - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = da * x[ix] ; - FLOAT m2 = da * x[ix+inc_x] ; - FLOAT m3 = da * x[ix+2*inc_x] ; - FLOAT m4 = da * x[ix+3*inc_x] ; + while (i < n1) { - y[iy] += m1 ; - y[iy+inc_y] += m2 ; - y[iy+2*inc_y] += m3 ; - y[iy+3*inc_y] += m4 ; + FLOAT m1 = da * x[ix]; + FLOAT m2 = da * x[ix + inc_x]; + FLOAT m3 = da * x[ix + 2 * inc_x]; + FLOAT m4 = da * x[ix + 3 * inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; - i+=4 ; + y[iy] += m1; + y[iy + inc_y] += m2; + y[iy + 2 * inc_y] += m3; + y[iy + 3 * inc_y] += m4; - } + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; - while(i < n) - { + } - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - return 0 ; - -} + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + return 0; +} diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index ff4227595c..44d27b062c 100644 --- a/kernel/zarch/scopy.c +++ 
b/kernel/zarch/scopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,6 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","r2" - ); +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n]) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if (n <= 0) return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - scopy_kernel_64(n1, x, y); - i = n1; - } + if (n <= 0) + return 0; - while (i < n) { - y[i] = x[i]; - i++; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + while (i < n) { + y[i] = x[i]; + i++; - } else { + } - while (i < n) { + } else { - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + while (i < n) { - } + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; } - return 0; + } + return 0; } diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index 5ddbc69bd6..f659b0c8a5 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018,The OpenBLAS Project +Copyright (c) 2013-2019,The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms,with or without modification,are permitted provided that the following conditions are @@ -27,114 +27,118 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
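The sdot.c hunk below applies the same accumulator-splitting as sasum: the old kernel chained every vfmasb through v0, while the new one spreads the products across v0..v7 and folds them together only after the loop. A two-accumulator scalar sketch of the shape (sdot_partial is an illustrative name; n is assumed even):

/* Two independent dot-product accumulators, merged after the loop. */
static float sdot_partial(long n, const float *x, const float *y) {
    float d0 = 0.0f, d1 = 0.0f;
    for (long i = 0; i < n; i += 2) {
        d0 += x[i] * y[i];
        d1 += x[i + 1] * y[i + 1];
    }
    return d0 + d1;
}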
#include "common.h" -static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - FLOAT dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vrepf %%v1,%%v0,1 \n\t" - "vrepf %%v2,%%v0,2 \n\t" - "vrepf %%v3,%%v0,3 \n\t" - "aebr %%f0,%%f1 \n\t" - "aebr %%f0,%%f2 \n\t" - "aebr %%f0,%%f3 \n\t" - "ler %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "vrepf %%v1,%%v0,1\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepf %%v3,%%v0,3\n\t" + "aebr %%f0,%%f1\n\t" + "aebr %%f0,%%f2\n\t" + "aebr %%f0,%%f3\n\t" + "ler %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), + [y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, 
FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - FLOAT dot = 0.0 ; + FLOAT dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -32; - if ( n1 ) - dot = sdot_kernel_32(n1,x,y); + if (n1) + dot = sdot_kernel_32(n1, x, y); - i = n1; - while(i < n) - { + i = n1; + while (i < n) { - dot += y[i] * x[i] ; - i++ ; + dot += y[i] * x[i]; + i++; - } - return(dot); + } + return (dot); + } - } + BLASLONG n1 = n & -2; - BLASLONG n1 = n & -2; + while (i < n1) { - while(i < n1) - { + dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; - dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; - ix += inc_x*2 ; - iy += inc_y*2 ; - i+=2 ; + } - } + while (i < n) { - while(i < n) - { + dot += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); + } + return (dot); } - - diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 01d8414de4..86ac249931 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,640 +29,559 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%5) \n\t" - "vlrepf %%v1,4(%5) \n\t" - "vlrepf %%v2,8(%5) \n\t" - "vlrepf %%v3,12(%5) \n\t" - "vlrepf %%v4,%7 \n\t" - "vfmsb %%v0,%%v0,%%v4 \n\t" - "vfmsb %%v1,%%v1,%%v4 \n\t" - "vfmsb %%v2,%%v2,%%v4 \n\t" - "vfmsb %%v3,%%v3,%%v4 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - "vl %%v24,32(%%r1,%1) \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vl %%v28,48(%%r1,%1) \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "vl %%v4,32(%%r1,%6) \n\t" - "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,32(%%r1,%6) \n\t" - - "vl %%v4,48(%%r1,%6) \n\t" - "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmasb 
%%v4,%%v30,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,48(%%r1,%6) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,64(%%r1,%2) \n\t" - "vl %%v18,64(%%r1,%3) \n\t" - "vl %%v19,64(%%r1,%4) \n\t" - "vl %%v20,80(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,80(%%r1,%3) \n\t" - "vl %%v23,80(%%r1,%4) \n\t" - "vl %%v24,96(%%r1,%1) \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vl %%v28,112(%%r1,%1) \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - - "vl %%v4,64(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,64(%%r1,%6) \n\t" - - "vl %%v4,80(%%r1,%6) \n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,80(%%r1,%6) \n\t" - - "vl %%v4,96(%%r1,%6) \n\t" - "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,96(%%r1,%6) \n\t" - - "vl %%v4,112(%%r1,%6) \n\t" - "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,112(%%r1,%6) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,8(%[x])\n\t" + "vlrepf %%v3,12(%[x])\n\t" + "vlrepf %%v4,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v4\n\t" + "vfmsb %%v1,%%v1,%%v4\n\t" + "vfmsb %%v2,%%v2,%%v4\n\t" + "vfmsb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl 
%%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%3) \n\t" - "vlrepf %%v1,4(%3) \n\t" - "vlrepf %%v2,%5 \n\t" - "vfmsb %%v0,%%v0,%%v2 \n\t" - "vfmsb %%v1,%%v1,%%v2 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr 
%%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "vl %%v20,32(%%r1,%1) \n\t" - "vl %%v21,32(%%r1,%2) \n\t" - "vl %%v22,48(%%r1,%1) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vl %%v26,80(%%r1,%1) \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vl %%v28,96(%%r1,%1) \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%1) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "vl %%v2,32(%%r1,%4) \n\t" - "vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" - "vst %%v2,32(%%r1,%4) \n\t" - - "vl %%v2,48(%%r1,%4) \n\t" - "vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" - "vst %%v2,48(%%r1,%4) \n\t" - - "vl %%v2,64(%%r1,%4) \n\t" - "vfmasb %%v2,%%v24,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v25,%%v1,%%v2 \n\t" - "vst %%v2,64(%%r1,%4) \n\t" - - "vl %%v2,80(%%r1,%4) \n\t" - "vfmasb %%v2,%%v26,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v27,%%v1,%%v2 \n\t" - "vst %%v2,80(%%r1,%4) \n\t" - - "vl %%v2,96(%%r1,%4) \n\t" - "vfmasb %%v2,%%v28,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v29,%%v1,%%v2 \n\t" - "vst %%v2,96(%%r1,%4) \n\t" - - "vl %%v2,112(%%r1,%4) \n\t" - "vfmasb %%v2,%%v30,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v31,%%v1,%%v2 \n\t" - "vst %%v2,112(%%r1,%4) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v2\n\t" + "vfmsb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl 
%%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%2) \n\t" - "vlrepf %%v1,%4 \n\t" - "vfmsb %%v0,%%v0,%%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%1) \n\t" - "vl %%v22,96(%%r1,%1) \n\t" - "vl %%v23,112(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "vl %%v1,32(%%r1,%3) \n\t" - "vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" - "vst %%v1,32(%%r1,%3) \n\t" - - "vl %%v1,48(%%r1,%3) \n\t" - "vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" - "vst %%v1,48(%%r1,%3) \n\t" - - "vl %%v1,64(%%r1,%3) \n\t" - "vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" - "vst %%v1,64(%%r1,%3) \n\t" - - "vl %%v1,80(%%r1,%3) \n\t" - "vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" - "vst %%v1,80(%%r1,%3) \n\t" - - "vl %%v1,96(%%r1,%3) \n\t" - "vfmasb %%v1,%%v22,%%v0,%%v1 \n\t" - "vst %%v1,96(%%r1,%3) \n\t" - - "vl %%v1,112(%%r1,%3) \n\t" - "vfmasb %%v1,%%v23,%%v0,%%v1 \n\t" - "vst %%v1,112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" 
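(Annotation, not part of the patch: the rewritten sgemv kernels above replace the old single running register - every vfmasb funnelled through one %%v4/%%v2/%%v1 - with several y blocks or partial sums held in flight at once (v4-v9 here), so consecutive FMAs form independent chains the core can overlap. A minimal C sketch of the same accumulator-splitting idea; the helper names are hypothetical.)

/* One accumulator: every multiply-add depends on the previous one. */
static float dot_serial(const float *a, const float *x, long n) {
  float s = 0.0f;
  for (long i = 0; i < n; i++)
    s += a[i] * x[i];
  return s;
}

/* Four accumulators: four independent FMA chains, combined only once
   at the end - the shape the rewritten kernels use. */
static float dot_split4(const float *a, const float *x, long n) {
  float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
  long i;
  for (i = 0; i + 4 <= n; i += 4) {
    s0 += a[i] * x[i];
    s1 += a[i + 1] * x[i + 1];
    s2 += a[i + 2] * x[i + 2];
    s3 += a[i + 3] * x[i + 3];
  }
  for (; i < n; i++) /* scalar tail */
    s0 += a[i] * x[i];
  return (s0 + s1) + (s2 + s3);
}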
- "vst %%v1,0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v16,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,0(%%r1,%[y])\n\t" + "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" + "vst %%v17,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i]; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8],*ybuffer; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - ybuffer = buffer; - - n1 = n >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - 
ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; } - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 4); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; 
*/ + + } + + } else { + + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } + } + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + + } + + if (m3 == 0) + return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * 
x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } - return(0); + return (0); } - - diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index a3136723ae..6ae9b6d7f2 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,783 +29,717 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl 
%%v30,80(%%r1,%3) \n\t" - "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v4,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v4 \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "ste %%f0,0(%6) \n\t" - "veslg %%v4,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v4 \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "ste %%f1,4(%6) \n\t" - "veslg %%v4,%%v2,32 \n\t" - "vfasb %%v2,%%v2,%%v4 \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "ste %%f2,8(%6) \n\t" - "veslg %%v4,%%v3,32 \n\t" - "vfasb %%v3,%%v3,%%v4 \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "ste %%f3,12(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmasb 
%%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v2,%%v2,%%v6\n\t" + "vfasb %%v3,%%v3,%%v7\n\t" + "veslg %%v4,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vrepg %%v4,%%v0,1\n\t" + "aebr %%f0,%%f4\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v4,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v4\n\t" + "vrepg %%v4,%%v1,1\n\t" + "aebr %%f1,%%f4\n\t" + "ste %%f1,4(%[y])\n\t" + "veslg %%v4,%%v2,32\n\t" + "vfasb %%v2,%%v2,%%v4\n\t" + "vrepg %%v4,%%v2,1\n\t" + "aebr %%f2,%%f4\n\t" + "ste %%f2,8(%[y])\n\t" + "veslg %%v4,%%v3,32\n\t" + "vfasb %%v3,%%v3,%%v4\n\t" + "vrepg %%v4,%%v3,1\n\t" + "aebr %%f3,%%f4\n\t" + "ste %%f3,12(%[y])" + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 
1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v2,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "ste %%f0,0(%4) \n\t" - "veslg %%v2,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v2 \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "ste %%f1,4(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl 
%%v31,48(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v1,%%v1,%%v3\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v1,%%v1,%%v7\n\t" + "veslg %%v2,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vrepg %%v2,%%v0,1\n\t" + "aebr %%f0,%%f2\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v2,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v2\n\t" + "vrepg %%v2,%%v1,1\n\t" + "aebr %%f1,%%f2\n\t" + "ste %%f1,4(%[y])" + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "ste %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - 
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "veslg %%v1,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vrepg %%v1,%%v0,1\n\t" + "aebr %%f0,%%f1\n\t" + "ste %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } } - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) 
\n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepf %%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if 
(inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; } + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - } + y_ptr = y; + a_ptr = a; + x_ptr = x; + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); - yp = ytemp; + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x4(NB,ap,xbuffer,yp); - ap[0] += lda4 ; - ap[1] += lda4 ; - ap[2] += lda4 ; - ap[3] += lda4 ; - yp += 4; - } - if ( n1 > 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; + if (n0 > 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { + + yp = ytemp; + for (i = 0; i < nb1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; } + add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += nb1 * inc_y * 4; + a_ptr += nb1 * lda4; - if ( n2 & 2 ) - { + } - sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + } - } + yp = ytemp; - if ( n2 & 1 ) - { + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; + } + if (n1 > 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; + if (n2 & 2) { + + sgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - } - a += NB; - x += NB * inc_x; } - if ( m3 == 0 ) return(0); + if (n2 & 1) { - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; 
j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = smax_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = smax_kernel_64(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index e7d83441b3..2e9c793c46 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
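(Annotation, not part of the patch: the CNAME wrappers above all share one shape - n & -64 rounds n down to the vector kernel's unroll width, the asm kernel covers that block, and a plain loop mops up the remainder. A minimal sketch of the pattern; max_kernel_64_ref is a hypothetical stand-in for the asm kernel.)

/* Stand-in for the vectorized smax_kernel_64; a plain loop here. */
static float max_kernel_64_ref(long n, const float *x) {
  float m = x[0];
  for (long i = 1; i < n; i++)
    if (x[i] > m)
      m = x[i];
  return m;
}

static float max_blocked(long n, const float *x) {
  long n1 = n & -64; /* largest multiple of 64 not exceeding n */
  long i;
  float m;
  if (n1 > 0) {
    m = max_kernel_64_ref(n1, x); /* unrolled kernel covers the bulk */
    i = n1;
  } else {
    m = x[0];
    i = 1;
  }
  for (; i < n; i++) /* scalar tail, at most 63 elements */
    if (x[i] > m)
      m = x[i];
  return m;
}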
#include "common.h" -static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfminsb %%v16,%%v16,%%v24,0 \n\t" - "vfminsb %%v17,%%v17,%%v25,0 \n\t" - "vfminsb %%v18,%%v18,%%v26,0 \n\t" - "vfminsb %%v19,%%v19,%%v27,0 \n\t" - "vfminsb %%v20,%%v20,%%v28,0 \n\t" - "vfminsb %%v21,%%v21,%%v29,0 \n\t" - "vfminsb %%v22,%%v22,%%v30,0 \n\t" - "vfminsb %%v23,%%v23,%%v31,0 \n\t" - - "vfminsb %%v16,%%v16,%%v20,0 \n\t" - "vfminsb %%v17,%%v17,%%v21,0 \n\t" - "vfminsb %%v18,%%v18,%%v22,0 \n\t" - "vfminsb %%v19,%%v19,%%v23,0 \n\t" - - "vfminsb %%v16,%%v16,%%v18,0 \n\t" - "vfminsb %%v17,%%v17,%%v19,0 \n\t" - - "vfminsb %%v16,%%v16,%%v17,0 \n\t" - - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v17,%%v17,%%v25,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v19,%%v19,%%v27,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v21,%%v21,%%v29,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v23,%%v23,%%v31,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v17,%%v17,%%v21,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v19,%%v19,%%v23,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v17,%%v17,%%v19,0\n\t" + "vfminsb %%v16,%%v16,%%v17,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); 
- if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = smin_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = smin_kernel_64(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c index 763cc664ac..5b21a19dcf 100644 --- a/kernel/zarch/srot.c +++ b/kernel/zarch/srot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
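(Annotation, not part of the patch: srot applies the standard BLAS plane rotation x' = c*x + s*y, y' = c*y - s*x. In the kernel below, vfmsb forms the x*c and x*s products, then vfmasb/vfmssb fold in y, matching the "yn=x*s" and "yn=y*c-yn" comments in the asm. A plain C reference version for comparison.)

/* Reference rotation; c and s come from the caller as in BLAS srot. */
static void srot_ref(long n, float *x, float *y, float c, float s) {
  for (long i = 0; i < n; i++) {
    float xn = c * x[i] + s * y[i]; /* vfmsb, then vfmasb */
    float yn = c * y[i] - s * x[i]; /* "yn=x*s", then "yn=y*c-yn" */
    x[i] = xn;
    y[i] = yn;
  }
}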
#include "common.h" -static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepf %%v0,%3 \n\t" - "vlrepf %%v1,%4 \n\t" - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" 
/* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepf %%v0,%[c]\n\t" + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl 
%%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] 
"a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT temp; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -64; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - srot_kernel_64(n1, x, y, &cosa, &sina); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - - i++ ; + BLASLONG n1 = n & -64; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i = n1; + } - } + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; + } else { - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; - } + ix += inc_x; + iy += inc_y; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index c18a7e56f3..07e6845c6d 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,175 +27,147 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%2) \n\t" - "vfmsb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vfmsb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vfmsb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vfmsb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vl %%v24, 64(%%r1,%2) \n\t" - "vfmsb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 64(%%r1,%2) \n\t" - "vl %%v25, 80(%%r1,%2) \n\t" - "vfmsb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vl %%v26, 96(%%r1,%2) \n\t" - "vfmsb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vl %%v27, 112(%%r1,%2) \n\t" - "vfmsb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v24","v25","v26","v27" - ); +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepf %%v0,%[da]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmsb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmsb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmsb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmsb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmsb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmsb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmsb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmsb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x),[da] "m"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, 
FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0,j=0; - if ( n <= 0 || inc_x <=0 ) - return(0); - - - if ( inc_x == 1 ) - { - - if ( da == 0.0 ) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - - sscal_kernel_32_zero(n1, x); - j=n1; - } - - while(j < n) - { - - x[j]=0.0; - j++; - } - - } - else - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - sscal_kernel_32(n1, da, x); - j=n1; - } - while(j < n) - { - - x[j] = da * x[j] ; - j++; - } - } +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); + if (inc_x == 1) { + if (da == 0.0) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + sscal_kernel_32_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + + } else { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + sscal_kernel_32(n1, da, x); + j = n1; + } + while (j < n) { + + x[j] = da * x[j]; + j++; + } } - else - { - if ( da == 0.0 ) - { + } else { - BLASLONG n1 = n & -2; + if (da == 0.0) { - while (j < n1) { + BLASLONG n1 = n & -2; - x[i]=0.0; - x[i + inc_x]=0.0; + while (j < n1) { - i += inc_x * 2; - j += 2; + x[i] = 0.0; + x[i + inc_x] = 0.0; - } - while(j < n) - { + i += inc_x * 2; + j += 2; - x[i]=0.0; - i += inc_x ; - j++; - } + } + while (j < n) { - } - else - { - BLASLONG n1 = n & -2; + x[i] = 0.0; + i += inc_x; + j++; + } - while (j < n1) { + } else { + BLASLONG n1 = n & -2; - x[i] = da * x[i] ; - x[i + inc_x] = da * x[i + inc_x]; + while (j < n1) { - i += inc_x * 2; - j += 2; + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; - } + i += inc_x * 2; + j += 2; - while(j < n) - { + } - x[i] = da * x[i] ; - i += inc_x ; - j++; - } - } + while (j < n) { + x[i] = da * x[i]; + i += inc_x; + j++; + } } - return 0; - -} + } + return 0; +} diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c index d0c0dc3f42..dc71131436 100644 --- a/kernel/zarch/sswap.c +++ b/kernel/zarch/sswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,138 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
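/* Sketch of the head/tail split every CNAME wrapper in this patch uses:
   n & -64 clears the low six bits, rounding n down to the 64-element unroll
   (e.g. n = 1000 gives n1 = 960); the vector kernel takes the head and plain
   scalar C finishes the remainder. Hypothetical wrapper: */
static void swap_sketch(BLASLONG n, FLOAT *x, FLOAT *y) {
  BLASLONG i = 0;
  BLASLONG n1 = n & -64;            /* largest multiple of 64 <= n */
  if (n1 > 0) {
    sswap_kernel_64(n1, x, y);      /* vectorized head */
    i = n1;
  }
  for (; i < n; i++) {              /* scalar tail, fewer than 64 elements */
    FLOAT t = y[i];
    y[i] = x[i];
    x[i] = t;
  }
}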
#include "common.h" -static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 
32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; - BLASLONG n1 = n & -64; - if ( n1 > 0 ) - { - sswap_kernel_64(n1, x, y); - i=n1; - } + if (n <= 0) + return (0); - while(i < n) - { - temp = y[i]; - y[i] = x[i] ; - x[i] = temp; - i++ ; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -64; + if (n1 > 0) { + sswap_kernel_64(n1, x, y); + i = n1; + } + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = y[iy]; - y[iy] = x[ix] ; - x[ix] = temp; - ix += inc_x ; - iy += inc_y ; - i++ ; + } else { - } + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index cc63471272..531e47a0b3 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
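/* Scalar reference (sketch, hypothetical helper) for the complex amax
   kernels below: BLAS ranks complex elements by CABS1 = |Re| + |Im|, not by
   the true modulus sqrt(Re^2 + Im^2); x holds n interleaved (Re, Im) pairs. */
static FLOAT zamax_ref_sketch(BLASLONG n, const FLOAT *x) {
  FLOAT m = fabs(x[0]) + fabs(x[1]);
  for (BLASLONG i = 1; i < n; i++) {
    FLOAT t = fabs(x[2 * i]) + fabs(x[2 * i + 1]); /* CABS1 of element i */
    if (t > m)
      m = t;
  }
  return m;
}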
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vleg %%v24,128(%%r1,%2),0 \n\t" - "vleg %%v25,136(%%r1,%2),0 \n\t" - "vleg %%v24,144(%%r1,%2),1 \n\t" - "vleg %%v25,152(%%r1,%2),1 \n\t" - "vleg %%v26,160(%%r1,%2),0 \n\t" - "vleg %%v27,168(%%r1,%2),0 \n\t" - "vleg %%v26,176(%%r1,%2),1 \n\t" - "vleg %%v27,184(%%r1,%2),1 \n\t" - "vleg %%v28,192(%%r1,%2),0 \n\t" - "vleg %%v29,200(%%r1,%2),0 \n\t" - "vleg %%v28,208(%%r1,%2),1 \n\t" - "vleg %%v29,216(%%r1,%2),1 \n\t" - "vleg %%v30,224(%%r1,%2),0 \n\t" - "vleg %%v31,232(%%r1,%2),0 \n\t" - "vleg %%v30,240(%%r1,%2),1 \n\t" - "vleg %%v31,248(%%r1,%2),1 \n\t" - - "vflpdb %%v16,%%v16 \n\t" - "vflpdb %%v17,%%v17 \n\t" - "vflpdb %%v18,%%v18 \n\t" - "vflpdb %%v19,%%v19 \n\t" - "vflpdb %%v20,%%v20 \n\t" - "vflpdb %%v21,%%v21 \n\t" - "vflpdb %%v22,%%v22 \n\t" - "vflpdb %%v23,%%v23 \n\t" - "vflpdb %%v24,%%v24 \n\t" - "vflpdb %%v25,%%v25 \n\t" - "vflpdb %%v26,%%v26 \n\t" - "vflpdb %%v27,%%v27 \n\t" - "vflpdb %%v28,%%v28 \n\t" - "vflpdb %%v29,%%v29 \n\t" - "vflpdb %%v30,%%v30 \n\t" - "vflpdb %%v31,%%v31 \n\t" - - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v18,%%v18,%%v19 \n\t" - "vfadb %%v20,%%v20,%%v21 \n\t" - "vfadb %%v22,%%v22,%%v23 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" - "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" - "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" - "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" - - "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" - "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" - - "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" - - "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg 
%%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmaxdb %%v16,%%v16,%%v24,0\n\t" + "vfmaxdb %%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - maxf = zamax_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { - - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - 
} - return (maxf); + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c index ae711c1730..cac2da938f 100644 --- a/kernel/zarch/zamax_z13.c +++ b/kernel/zarch/zamax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include <math.h> -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg 
%%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return amax; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb 
%%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - maxf = zamax_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { - - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (maxf); + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 18610daea3..940d81dd20 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
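/* zamin.c below relies on the z14-only vector FP min (vfmindb); the parallel
   zamin_z13.c keeps the z13 emulation, where the minimum is built from a
   compare-high (vfchdb) and a bitwise select (vsel). Scalar sketch of that
   instruction pair (hypothetical helper): */
static FLOAT min_z13_sketch(FLOAT a, FLOAT b) {
  /* vfchdb yields an all-ones mask where b > a; vsel then picks a,
     otherwise b, which is min(a, b) for ordered values */
  return (b > a) ? a : b;
}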
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vleg %%v24,128(%%r1,%2),0 \n\t" - "vleg %%v25,136(%%r1,%2),0 \n\t" - "vleg %%v24,144(%%r1,%2),1 \n\t" - "vleg %%v25,152(%%r1,%2),1 \n\t" - "vleg %%v26,160(%%r1,%2),0 \n\t" - "vleg %%v27,168(%%r1,%2),0 \n\t" - "vleg %%v26,176(%%r1,%2),1 \n\t" - "vleg %%v27,184(%%r1,%2),1 \n\t" - "vleg %%v28,192(%%r1,%2),0 \n\t" - "vleg %%v29,200(%%r1,%2),0 \n\t" - "vleg %%v28,208(%%r1,%2),1 \n\t" - "vleg %%v29,216(%%r1,%2),1 \n\t" - "vleg %%v30,224(%%r1,%2),0 \n\t" - "vleg %%v31,232(%%r1,%2),0 \n\t" - "vleg %%v30,240(%%r1,%2),1 \n\t" - "vleg %%v31,248(%%r1,%2),1 \n\t" - - "vflpdb %%v16,%%v16 \n\t" - "vflpdb %%v17,%%v17 \n\t" - "vflpdb %%v18,%%v18 \n\t" - "vflpdb %%v19,%%v19 \n\t" - "vflpdb %%v20,%%v20 \n\t" - "vflpdb %%v21,%%v21 \n\t" - "vflpdb %%v22,%%v22 \n\t" - "vflpdb %%v23,%%v23 \n\t" - "vflpdb %%v24,%%v24 \n\t" - "vflpdb %%v25,%%v25 \n\t" - "vflpdb %%v26,%%v26 \n\t" - "vflpdb %%v27,%%v27 \n\t" - "vflpdb %%v28,%%v28 \n\t" - "vflpdb %%v29,%%v29 \n\t" - "vflpdb %%v30,%%v30 \n\t" - "vflpdb %%v31,%%v31 \n\t" - - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v18,%%v18,%%v19 \n\t" - "vfadb %%v20,%%v20,%%v21 \n\t" - "vfadb %%v22,%%v22,%%v23 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfmindb %%v16,%%v16,%%v24,0 \n\t" - "vfmindb %%v18,%%v18,%%v26,0 \n\t" - "vfmindb %%v20,%%v20,%%v28,0 \n\t" - "vfmindb %%v22,%%v22,%%v30,0 \n\t" - - "vfmindb %%v16,%%v16,%%v20,0 \n\t" - "vfmindb %%v18,%%v18,%%v22,0 \n\t" - - "vfmindb %%v16,%%v16,%%v18,0 \n\t" - - "vfmindb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg 
%%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - minf = zamin_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if 
(CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c index f82c57e81f..7417e0b742 100644 --- a/kernel/zarch/zamin_z13.c +++ b/kernel/zarch/zamin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include <math.h> -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg 
%%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return amin; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb 
%%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - minf = zamin_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 8faaf20ebc..43ae8ff8b7 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,138 +28,126 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
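For reference, the reduction zamin_kernel_16 above vectorizes is the minimum of the complex 1-norm: each element contributes CABS1 = |re| + |im| (vflpdb for the absolute values, vfadb for the sum), the running minimum is kept with vfchdb/vsel compare-and-select pairs, and the final vrepg/wfchdb folds the two doubleword lanes of %%v0 into one scalar. A scalar C sketch of the same reduction (hypothetical helper name; like the caller, it assumes n > 0 and x points at n complex doubles):

#include <math.h>

/* Scalar model of zamin_kernel_16: minimum of |re|+|im| over n
   complex doubles.  The vector kernel additionally requires n to
   be a multiple of 16 and handles 16 elements per iteration. */
static double zamin_scalar_model(long n, const double *x) {
  double amin = fabs(x[0]) + fabs(x[1]);
  for (long i = 1; i < n; i++) {
    double cabs1 = fabs(x[2 * i]) + fabs(x[2 * i + 1]);
    if (cabs1 < amin)
      amin = cabs1;
  }
  return amin;
}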
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v2 \n\t" - "vfadb %%v0,%%v0,%%v3 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, 
%%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ip=0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; - if (n <= 0 || inc_x <= 0) return(sumf); + if (n <= 0 || inc_x <= 0) + return (sumf); - if ( inc_x == 1 ) - { + if (inc_x == 1) { - n1 = n & -16; - if ( n1 > 0 ) - { + n1 = n & -16; + if (n1 > 0) { - sumf = zasum_kernel_16(n1, x); - i=n1; - ip=2*n1; - } - - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - i++; - ip+=2; - } + sumf = zasum_kernel_16(n1, x); + i = n1; + ip = 2 * n1; + } + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; } - else - { - inc_x2 = 2* inc_x; - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - ip+=inc_x2; - i++; - } + } else { + inc_x2 = 2 * inc_x; + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; } - return(sumf); -} - + } + return (sumf); +} diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index f0e993d2f0..31549849d8 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,144 +27,136 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( #if !defined(CONJ) - "vlrepg %%v0,0(%3) \n\t" - "vleg %%v1,8(%3),0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,8(%3),1 \n\t" -#else - "vleg %%v0,0(%3),1 \n\t" - "vflcdb %%v0,%%v0 \n\t" - "vleg %%v0,0(%3),0 \n\t" - "vlrepg %%v1,8(%3) \n\t" + "vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" +#else + "vleg %%v0,0(%[alpha]),1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,0(%[alpha]),0\n\t" + "vlrepg %%v1,8(%[alpha])\n\t" #endif - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,80(%%r1,%1) \n\t" - "vl %%v18,96(%%r1,%1) \n\t" - "vl %%v19,112(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,64(%%r1,%2) \n\t" - "vst %%v29,80(%%r1,%2) \n\t" - "vst %%v30,96(%%r1,%2) \n\t" - "vst %%v31,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vpdi %%v24,%%v8,%%v8,4\n\t" + "vpdi %%v25,%%v9,%%v9,4\n\t" + "vpdi %%v26,%%v10,%%v10,4\n\t" + "vpdi %%v27,%%v11,%%v11,4\n\t" + "vpdi %%v28,%%v16,%%v16,4\n\t" + "vpdi %%v29,%%v17,%%v17,4\n\t" + 
"vpdi %%v30,%%v18,%%v18,4\n\t" + "vpdi %%v31,%%v19,%%v19,4\n\t" + "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT da[2] __attribute__ ((aligned(16))); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2] __attribute__ ((aligned(16))); - if (n <= 0) return (0); + if (n <= 0) + return (0); - if ((inc_x == 1) && (inc_y == 1)) { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -8; + BLASLONG n1 = n & -8; - if (n1) { - da[0] = da_r; - da[1] = da_i; - zaxpy_kernel_8(n1, x, y, da); - ix = 2 * n1; - } - i = n1; - while (i < n) { + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { #if !defined(CONJ) - y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); - y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); - y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - i++; - ix += 2; - - } - return (0); - + i++; + ix += 2; } + return (0); - inc_x *= 2; - inc_y *= 2; + } - while (i < n) { + inc_x *= 2; + inc_y *= 2; + + while (i < n) { #if !defined(CONJ) - y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); - y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); - y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } - return (0); + } + return (0); } - - diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 8c940bba3c..2f80cedceb 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,4 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","r2" - ); +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],4\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n * 2]) x) + : "cc"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + if (n <= 0) + return (0); - if ( (inc_x == 1) && (inc_y == 1 )) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zcopy_kernel_16(n1, x, y); - i=n1; - ix=n1*2; - iy=n1*2; - } - - while(i < n) - { - y[iy] = x[iy] ; - y[iy+1] = x[ix+1] ; - ix+=2; - iy+=2; - i++ ; - - } + BLASLONG n1 = n & -16; + if (n1 > 0) { + zcopy_kernel_16(n1, x, y); + i = n1; + ix = n1 * 2; + iy = n1 * 2; + } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += 2; + iy += 2; + i++; } - else - { - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + } else { - while(i < n) - { - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; - } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += inc_x2; + iy += inc_y2; + i++; } - - return(0); + + } + + return (0); } diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index aab18e2e9b..7a67ef734b 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,152 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
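The zcopy kernel relies on mvc, a 256-byte storage-to-storage move, so one instruction per iteration copies 16 complex doubles; the rewrite also drops the manual lgr copies into r1/r2 and advances the named x/y operands in place with la. Functionally the kernel is a block memcpy, as in this sketch (hypothetical helper; assumes, as the kernel does, that n is a multiple of 16):

#include <string.h>

/* One 256-byte block per iteration = 16 complex doubles, matching
   the mvc in zcopy_kernel_16. */
static void zcopy_scalar_model(long n, const double *x, double *y) {
  for (long i = 0; i < n; i += 16)
    memcpy(y + 2 * i, x + 2 * i, 256);
}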
#include "common.h" -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "vzero %%v28 \n\t" - "vzero %%v29 \n\t" - "vzero %%v30 \n\t" - "vzero %%v31 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - - "vl %%v16, 64(%%r1,%1) \n\t" - "vl %%v17, 80(%%r1,%1) \n\t" - "vl %%v18, 96(%%r1,%1) \n\t" - "vl %%v19, 112(%%r1,%1) \n\t" - "vl %%v0, 64(%%r1,%2) \n\t" - "vl %%v1, 80(%%r1,%2) \n\t" - "vl %%v2, 96(%%r1,%2) \n\t" - "vl %%v3, 112(%%r1,%2) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v24,%%v24,%%v26 \n\t" - "vfadb %%v24,%%v24,%%v28 \n\t" - "vfadb %%v24,%%v24,%%v30 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vfadb %%v25,%%v25,%%v29 \n\t" - "vfadb %%v25,%%v25,%%v31 \n\t" - "vsteg %%v24,0(%3),0 \n\t" - "vsteg %%v24,8(%3),1 \n\t" - "vsteg %%v25,16(%3),1 \n\t" - "vsteg %%v25,24(%3),0 " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 
112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v25,%%v25,%%v27\n\t" + "vfadb %%v25,%%v25,%%v29\n\t" + "vfadb %%v25,%%v25,%%v31\n\t" + "vsteg %%v24,0(%[d]),0\n\t" + "vsteg %%v24,8(%[d]),1\n\t" + "vsteg %%v25,16(%[d]),1\n\t" + "vsteg %%v25,24(%[d]),0" + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); - - } +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); - if ((inc_x == 1) && (inc_y == 1)) { + } - BLASLONG n1 = n & -8; + if ((inc_x == 1) && (inc_y == 1)) { - if (n1) - zdot_kernel_8(n1, x, y, dot); + BLASLONG n1 = n & -8; - i = n1; - BLASLONG j = i * 2; + if (n1) + zdot_kernel_8(n1, x, y, dot); - while (i < n) { + i = n1; + BLASLONG j = i * 2; - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; + while (i < n) { - j += 2; - i++; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - } + j += 2; + i++; + } - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } } + } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } - - diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 9472b5d5a4..7f21985ecf 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ 
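zdot_kernel_8 above keeps four running sums instead of forming each complex product in the loop: dot[0] = sum xr*yr, dot[1] = sum xi*yi, dot[2] = sum xr*yi, dot[3] = sum xi*yr, with vpdi supplying the (xi, xr) ordering so two FMAs per vector pair feed all four. Only the final combine depends on CONJ, which keeps the hot loop branch-free. A scalar sketch (hypothetical helper; returns the result through re/im pointers rather than OPENBLAS_COMPLEX_FLOAT):

/* Four partial sums plus the CONJ-dependent combine, as in zdot. */
static void zdot_scalar_model(long n, const double *x, const double *y,
                              int conj, double *re, double *im) {
  double dot[4] = {0.0, 0.0, 0.0, 0.0};
  for (long i = 0; i < n; i++) {
    dot[0] += x[2 * i] * y[2 * i];
    dot[1] += x[2 * i + 1] * y[2 * i + 1];
    dot[2] += x[2 * i] * y[2 * i + 1];
    dot[3] += x[2 * i + 1] * y[2 * i];
  }
  if (!conj) {
    *re = dot[0] - dot[1];
    *im = dot[2] + dot[3];
  } else {
    *re = dot[0] + dot[1];
    *im = dot[2] - dot[3];
  }
}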
/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,691 +25,632 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include #include "common.h" #define NBMAX 1024 -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%5) \n\t" - "vl %%v17,16(%5) \n\t" - "vl %%v18,32(%5) \n\t" - "vl %%v19,48(%5) \n\t" +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v20,8(%5),0 \n\t" - "wflcdb %%v20,%%v20 \n\t" - "vleg %%v20,0(%5),1 \n\t" - "vleg %%v21,24(%5),0 \n\t" - "wflcdb %%v21,%%v21 \n\t" - "vleg %%v21,16(%5),1 \n\t" - "vleg %%v22,40(%5),0 \n\t" - "wflcdb %%v22,%%v22 \n\t" - "vleg %%v22,32(%5),1 \n\t" - "vleg %%v23,56(%5),0 \n\t" - "wflcdb %%v23,%%v23 \n\t" - "vleg %%v23,48(%5),1 \n\t" + "vleg %%v20,8(%[x]),0\n\t" + "wflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "wflcdb %%v22,%%v22\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vleg %%v23,56(%[x]),0\n\t" + "wflcdb %%v23,%%v23\n\t" + "vleg %%v23,48(%[x]),1\n\t" #else - "vleg %%v20,0(%5),1 \n\t" - "vflcdb %%v20,%%v20 \n\t" - "vleg %%v20,8(%5),0 \n\t" - "vleg %%v21,16(%5),1 \n\t" - "vflcdb %%v21,%%v21 \n\t" - "vleg %%v21,24(%5),0 \n\t" - "vleg %%v22,32(%5),1 \n\t" - "vflcdb %%v22,%%v22 \n\t" - "vleg %%v22,40(%5),0 \n\t" - "vleg %%v23,48(%5),1 \n\t" - "vflcdb %%v23,%%v23 \n\t" - "vleg %%v23,56(%5),0 \n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vflcdb %%v21,%%v21\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vflcdb %%v22,%%v22\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "vleg %%v23,48(%[x]),1\n\t" + "vflcdb %%v23,%%v23\n\t" + "vleg %%v23,56(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlrepg %%v24,0(%%r1,%1) \n\t" - "vlrepg %%v25,8(%%r1,%1) \n\t" - "vlrepg %%v26,0(%%r1,%2) \n\t" - "vlrepg %%v27,8(%%r1,%2) \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlrepg %%v28,0(%%r1,%3) \n\t" - "vlrepg %%v29,8(%%r1,%3) \n\t" - "vlrepg %%v30,0(%%r1,%4) \n\t" - "vlrepg %%v31,8(%%r1,%4) \n\t" - - "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,0(%%r1,%6) \n\t" - - "vlrepg %%v24,16(%%r1,%1) \n\t" - "vlrepg %%v25,24(%%r1,%1) \n\t" - "vlrepg %%v26,16(%%r1,%2) \n\t" - "vlrepg %%v27,24(%%r1,%2) \n\t" - - "vl %%v0,16(%%r1,%6) \n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmadb 
%%v0,%%v25,%%v20,%%v0 \n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlrepg %%v28,16(%%r1,%3) \n\t" - "vlrepg %%v29,24(%%r1,%3) \n\t" - "vlrepg %%v30,16(%%r1,%4) \n\t" - "vlrepg %%v31,24(%%r1,%4) \n\t" - - "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,16(%%r1,%6) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap0])\n\t" + "vlrepg %%v29,24(%%r1,%[ap0])\n\t" + "vlrepg %%v30,16(%%r1,%[ap1])\n\t" + "vlrepg %%v31,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" + "vlrepg %%v24,0(%%r1,%[ap2])\n\t" + "vlrepg %%v25,8(%%r1,%[ap2])\n\t" + "vlrepg %%v26,0(%%r1,%[ap3])\n\t" + "vlrepg %%v27,8(%%r1,%[ap3])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%3) \n\t" - "vl %%v17,16(%3) \n\t" +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v18,8(%3),0 \n\t" - "wflcdb %%v18,%%v18 \n\t" - "vleg %%v18,0(%3),1 \n\t" - "vleg %%v19,24(%3),0 \n\t" - "wflcdb %%v19,%%v19 \n\t" - "vleg %%v19,16(%3),1 \n\t" + "vleg %%v18,8(%[x]),0\n\t" + "wflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vleg %%v19,24(%[x]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,16(%[x]),1\n\t" #else - 
"vleg %%v18,0(%3),1 \n\t" - "vflcdb %%v18,%%v18 \n\t" - "vleg %%v18,8(%3),0 \n\t" - "vleg %%v19,16(%3),1 \n\t" - "vflcdb %%v19,%%v19 \n\t" - "vleg %%v19,24(%3),0 \n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "vleg %%v19,16(%[x]),1\n\t" + "vflcdb %%v19,%%v19\n\t" + "vleg %%v19,24(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlrepg %%v20,0(%%r1,%1) \n\t" - "vlrepg %%v21,8(%%r1,%1) \n\t" - "vlrepg %%v22,0(%%r1,%2) \n\t" - "vlrepg %%v23,8(%%r1,%2) \n\t" - - "vl %%v0,0(%%r1,%4) \n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,0(%%r1,%4) \n\t" - - "vlrepg %%v20,16(%%r1,%1) \n\t" - "vlrepg %%v21,24(%%r1,%1) \n\t" - "vlrepg %%v22,16(%%r1,%2) \n\t" - "vlrepg %%v23,24(%%r1,%2) \n\t" - - "vl %%v0,16(%%r1,%4) \n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,16(%%r1,%4) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27"); } -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%2) \n\t" +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v17,8(%2),0 \n\t" - "wflcdb %%v17,%%v17 \n\t" - "vleg %%v17,0(%2),1 \n\t" + "vleg %%v17,8(%[x]),0\n\t" + "wflcdb %%v17,%%v17\n\t" + "vleg %%v17,0(%[x]),1\n\t" #else - "vleg %%v17,0(%2),1 \n\t" - "vflcdb %%v17,%%v17 \n\t" - "vleg %%v17,8(%2),0 \n\t" + "vleg %%v17,0(%[x]),1\n\t" + "vflcdb %%v17,%%v17\n\t" + "vleg %%v17,8(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vlrepg %%v18,0(%%r1,%1) \n\t" - "vlrepg %%v19,8(%%r1,%1) \n\t" - - "vl 
%%v0,0(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,0(%%r1,%3) \n\t" - - "vlrepg %%v18,16(%%r1,%1) \n\t" - "vlrepg %%v19,24(%%r1,%1) \n\t" - - "vl %%v0,16(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vlrepg %%v20,16(%%r1,%[ap])\n\t" + "vlrepg %%v21,24(%%r1,%[ap])\n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) -{ - __asm__ volatile ( -#if !defined(XCONJ) - "vlrepg %%v0,%3 \n\t" - "vleg %%v1,%4,0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,%4,1 \n\t" +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, + FLOAT alpha_i) { + __asm__( +#if !defined(XCONJ) + "vlrepg %%v0,%[alpha_r]\n\t" + "vleg %%v1,%[alpha_i],0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,%[alpha_i],1\n\t" #else - "vleg %%v0,%3,1 \n\t" - "vflcdb %%v0,%%v0 \n\t" - "vleg %%v0,%3,0 \n\t" - "vlrepg %%v1,%4 \n\t" + "vleg %%v0,%[alpha_r],1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,%[alpha_r],0\n\t" + "vlrepg %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,2 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "agfi %%r1,64 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl 
%%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), + [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, + FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if ( inc_dest != 2 ) - { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if ( inc_x != 2 ) - copy_x(NB,x_ptr,xbuffer,inc_x); - else - xbuffer = x_ptr; - - if ( inc_y == 2 ) - { - - for( i = 0; i < n1 ; i++) - { - zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if ( n2 & 2 ) - { - zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if ( n2 & 1 ) - { - zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } - else - { - - for( i = 0; i < n1 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for( i = 0; i < n2 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - - - if ( m3 == 0 ) return(0); - - x_ptr = x; - j=0; - a_ptr = a; - y_ptr = y; - - if ( m3 == 3 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while ( j < n) - { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + 
BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8], *xbuffer; + FLOAT alpha[2]; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * 
temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } - - - if ( m3 == 2 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } - + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( 
!defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - if ( m3 == 1 ) - { + if (m3 == 1) { - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = 
a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - return(0); + return (0); } diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 75027a06c0..aa7f166052 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepg %%v0,%3 \n\t" - "vlrepg %%v1,%4 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb 
%%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 
1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + 
"vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - zrot_kernel_16(n1, x, y, &cosa, &sina); - i=n1; - ix=2*n1; - } - - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + zrot_kernel_16(n1, x, y, &cosa, &sina); + i = n1; + ix = 2 * n1; + } - } + while (i < n) { + temp[0] = c * x[ix] + s * y[ix]; + temp[1] = c * x[ix + 1] + s * y[ix + 1]; + y[ix] = c * y[ix] - s * x[ix]; + y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + ix += 2; + i++; } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - } + } else { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + while (i < n) { + temp[0] = c * x[ix] + s * y[iy]; + temp[1] = c * x[ix + 1] + s * y[iy + 1]; + y[iy] = c * y[iy] - s * x[ix]; + y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + + ix += inc_x2; + 
iy += inc_y2; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4d8ee960fd..fbcc0c5b9a 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013 - 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,426 +27,396 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepg %%v0,0(%1) \n\t" - "vleg %%v1,8(%1),0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,8(%1),1 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - "vpdi %%v28,%%v20,%%v20,4 \n\t" - "vpdi %%v29,%%v21,%%v21,4 \n\t" - "vpdi %%v30,%%v22,%%v22,4 \n\t" - "vpdi %%v31,%%v23,%%v23,4 \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v0 \n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vleg %%v0,8(%1),0 \n\t" - "wflcdb %%v0,%%v0 \n\t" - "vleg %%v0,8(%1),1 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v16,%%v16,%%v16,4 \n\t" - "vpdi %%v17,%%v17,%%v17,4 \n\t" - "vpdi %%v18,%%v18,%%v18,4 \n\t" - "vpdi %%v19,%%v19,%%v19,4 \n\t" - "vpdi %%v20,%%v20,%%v20,4 \n\t" - "vpdi %%v21,%%v21,%%v21,4 \n\t" - "vpdi %%v22,%%v22,%%v22,4 \n\t" - "vpdi %%v23,%%v23,%%v23,4 \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - 
"vfmdb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vpdi %%v28,%%v20,%%v20,4\n\t" + "vpdi %%v29,%%v21,%%v21,4\n\t" + "vpdi %%v30,%%v22,%%v22,4\n\t" + "vpdi %%v31,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepg %%v0,0(%1) \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static 
void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vleg %%v0,8(%[alpha]),0\n\t" + "wflcdb %%v0,%%v0\n\t" + "vleg %%v0,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v16,%%v16,%%v16,4\n\t" + "vpdi %%v17,%%v17,%%v17,4\n\t" + "vpdi %%v18,%%v18,%%v18,4\n\t" + "vpdi %%v19,%%v19,%%v19,4\n\t" + "vpdi %%v20,%%v20,%%v20,4\n\t" + "vpdi %%v21,%%v21,%%v21,4\n\t" + "vpdi %%v22,%%v22,%%v22,4\n\t" + "vpdi %%v23,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepg %%v0,0(%[alpha])\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - 
FLOAT da_i = alpha[1]; - - for (i = 0; i < n; i += 4) - { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = t2; - x[inc_x3] = t3; - - x += 4 * inc_x; - } +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - FLOAT alpha[2] __attribute__ ((aligned(16))); - - if (inc_x != 1) { - inc_x <<= 1; - - if (da_r == 0.0) { - - BLASLONG n1 = n & -2; - - if (da_i == 0.0) { +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, + BLASLONG inc_x) { + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} - while (j < n1) { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; + if (inc_x != 1) { + inc_x <<= 1; - } + if (da_r == 0.0) { - while (j < n) { + BLASLONG n1 = n & -2; - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; + if (da_i == 0.0) { - } + while (j < n1) { - } else { + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; - while (j < n1) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; - while (j < n) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + } else { - } + while (j < n1) { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = 
temp1; + i += 2 * inc_x; + j += 2; + } - } + while (j < n) { - } else { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + } - if (da_i == 0.0) { - BLASLONG n1 = n & -2; + } - while (j < n1) { + } else { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + if (da_i == 0.0) { + BLASLONG n1 = n & -2; - } + while (j < n1) { - while (j < n) { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; + } - } + while (j < n) { - } else { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; - BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - zscal_kernel_inc_8(n1, alpha, x, inc_x); - j = n1; - i = n1 * inc_x; - } + } - while (j < n) { + } else { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } - } + while (j < n) { - } + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; } - return (0); - } + } + } - BLASLONG n1 = n & -8; - if (n1 > 0) { + return (0); + } - alpha[0] = da_r; - alpha[1] = da_i; + BLASLONG n1 = n & -8; + if (n1 > 0) { - if (da_r == 0.0) - if (da_i == 0) - zscal_kernel_8_zero(n1, x); - else - zscal_kernel_8_zero_r(n1, alpha, x); - else - if (da_i == 0) - zscal_kernel_8_zero_i(n1, alpha, x); - else - zscal_kernel_8(n1, alpha, x); + alpha[0] = da_r; + alpha[1] = da_i; - i = n1 << 1; - j = n1; - } + if (da_r == 0.0) + if (da_i == 0) + zscal_kernel_8_zero(n1, x); + else + zscal_kernel_8_zero_r(n1, alpha, x); + else if (da_i == 0) + zscal_kernel_8_zero_i(n1, alpha, x); + else + zscal_kernel_8(n1, alpha, x); + i = n1 << 1; + j = n1; + } - if (da_r == 0.0) { + if (da_r == 0.0) { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } + } - } + } - } else { + } else { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } - - } + } } - return (0); + } + + return (0); } diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index a16b87cdc7..0f38103be7 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 
192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zswap_kernel_16(n1, x, y); - i=n1; - ix = 2* n1; - iy = 2* n1; - } - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += 2 ; - iy += 2 ; - i++ ; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + zswap_kernel_16(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + while (i < n) { - } + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + ix += 2; + iy += 2; + i++; } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + } else { - while(i < n) - { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; + while (i < n) { - ix += inc_x2 ; - iy += 
inc_y2 ; - i++ ; + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; - } + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - - -} + } + return (0); +} From 61526480f906c2d9b4c6a5d2d28be21d0f96ca62 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 5 Feb 2019 07:51:19 +0200 Subject: [PATCH 088/189] [ZARCH] Fix copy constraint --- kernel/zarch/ccopy.c | 2 +- kernel/zarch/dcopy.c | 2 +- kernel/zarch/scopy.c | 2 +- kernel/zarch/zcopy.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index 1b93a812eb..d17bddcc86 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -36,7 +36,7 @@ static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index f7cbf54b2e..b6a740c431 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -36,7 +36,7 @@ static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index 44d27b062c..4e4993737d 100644 --- a/kernel/zarch/scopy.c +++ b/kernel/zarch/scopy.c @@ -36,7 +36,7 @@ static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 2f80cedceb..50ff186461 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -36,7 +36,7 @@ static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } From f4b82d7bc4c20da29c19b2eece602002bd5fe4af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Feb 2019 13:30:13 +0100 Subject: [PATCH 089/189] Include complex rather than complex.h in C++ contexts to avoid name clashes e.g. with boost headers that use I as a generic placeholder. Fixes #1992 as suggested by aprokop in that issue ticket. 
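For illustration only (not part of this patch): C99 complex.h defines I as a macro for the imaginary unit, so a hypothetical C++ translation unit that pulls in lapacke.h ahead of code using I as an ordinary identifier fails to compile. A minimal sketch of the clash:

    // compiled as C++; hypothetical reproducer, not OpenBLAS code
    #include <complex.h>       // C header: defines the macro I
    template <typename I>      // error: 'I' expands to the imaginary constant
    struct holder { I value; };

With <complex> selected under __cplusplus instead, the identifier I stays available to user code.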
--- lapack-netlib/LAPACKE/include/lapacke.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 6ded78c8b7..11740e1132 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -86,7 +86,11 @@ lapack_complex_float lapack_make_complex_float( float re, float im ); /* Complex type (double precision) */ #ifndef lapack_complex_double +#ifndef __cplusplus #include <complex.h> +#else +#include <complex> +#endif #define lapack_complex_double double _Complex #endif From 11a43e81161e5bd3f90e38a1127b1562406e85cd Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 5 Feb 2019 19:17:08 +0200 Subject: [PATCH 090/189] [ZARCH] Set alignment hint for vl/vst --- kernel/zarch/damax.c | 34 ++--- kernel/zarch/damax_z13.c | 34 ++--- kernel/zarch/damin.c | 34 ++--- kernel/zarch/damin_z13.c | 34 ++--- kernel/zarch/dasum.c | 32 ++--- kernel/zarch/daxpy.c | 96 +++++++-------- kernel/zarch/ddot.c | 32 ++--- kernel/zarch/dgemv_n_4.c | 260 +++++++++++++++++++-------------------- kernel/zarch/dgemv_t_4.c | 260 +++++++++++++++++++-------------------- kernel/zarch/dmax.c | 34 ++--- kernel/zarch/dmax_z13.c | 34 ++--- kernel/zarch/dmin.c | 34 ++--- kernel/zarch/dmin_z13.c | 34 ++--- kernel/zarch/drot.c | 128 +++++++++---------- kernel/zarch/dscal.c | 48 ++++---- kernel/zarch/dswap.c | 128 +++++++++---------- kernel/zarch/idamax.c | 34 ++--- kernel/zarch/idamin.c | 34 ++--- kernel/zarch/idmax.c | 34 ++--- kernel/zarch/idmin.c | 34 ++--- kernel/zarch/zasum.c | 32 ++--- kernel/zarch/zaxpy.c | 48 ++++---- kernel/zarch/zdot.c | 32 ++--- kernel/zarch/zgemv_n_4.c | 62 +++++----- kernel/zarch/zgemv_t_4.c | 40 +++--- kernel/zarch/zrot.c | 128 +++++++++---------- kernel/zarch/zscal.c | 112 ++++++++--------- kernel/zarch/zswap.c | 128 +++++++++---------- 28 files changed, 987 insertions(+), 987 deletions(-) diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 37008f702d..2598145c31 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmaxdb %%v16,%%v16,%%v24,8\n\t" "vfmaxdb %%v17,%%v17,%%v25,8\n\t" "vfmaxdb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 530d6e5bb6..f7e11c3cea 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index a01791741d..25f018c662 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmindb %%v16,%%v16,%%v24,8\n\t" "vfmindb %%v17,%%v17,%%v25,8\n\t" "vfmindb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 2172b6d6f5..091aceb37d 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 9f69a99314..641949963e 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -45,14 +45,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - 
"vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[x]),3\n\t" + "vl %%v17, 144(%%r1,%[x]),3\n\t" + "vl %%v18, 160(%%r1,%[x]),3\n\t" + "vl %%v19, 176(%%r1,%[x]),3\n\t" + "vl %%v20, 192(%%r1,%[x]),3\n\t" + "vl %%v21, 208(%%r1,%[x]),3\n\t" + "vl %%v22, 224(%%r1,%[x]),3\n\t" + "vl %%v23, 240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 179ef8834c..c02ad0aac3 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -34,22 +34,22 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,0(%%r1,%[y]),3\n\t" + "vl %%v21,16(%%r1,%[y]),3\n\t" + "vl %%v22,32(%%r1,%[y]),3\n\t" + "vl %%v23,48(%%r1,%[y]),3\n\t" + "vl %%v24,64(%%r1,%[x]),3\n\t" + "vl %%v25,80(%%r1,%[x]),3\n\t" + "vl %%v26,96(%%r1,%[x]),3\n\t" + "vl %%v27,112(%%r1,%[x]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -58,30 +58,30 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl 
%%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" + "vst %%v16,0(%%r1,%[y]),3\n\t" + "vst %%v17,16(%%r1,%[y]),3\n\t" + "vst %%v18,32(%%r1,%[y]),3\n\t" + "vst %%v19,48(%%r1,%[y]),3\n\t" + "vst %%v24,64(%%r1,%[y]),3\n\t" + "vst %%v25,80(%%r1,%[y]),3\n\t" + "vst %%v26,96(%%r1,%[y]),3\n\t" + "vst %%v27,112(%%r1,%[y]),3\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,128(%%r1,%[y]),3\n\t" + "vl %%v21,144(%%r1,%[y]),3\n\t" + "vl %%v22,160(%%r1,%[y]),3\n\t" + "vl %%v23,176(%%r1,%[y]),3\n\t" + "vl %%v24,192(%%r1,%[x]),3\n\t" + "vl %%v25,208(%%r1,%[x]),3\n\t" + "vl %%v26,224(%%r1,%[x]),3\n\t" + "vl %%v27,240(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[y]),3\n\t" + "vl %%v29,208(%%r1,%[y]),3\n\t" + "vl %%v30,224(%%r1,%[y]),3\n\t" + "vl %%v31,240(%%r1,%[y]),3\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -90,14 +90,14 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" + "vst %%v16,128(%%r1,%[y]),3\n\t" + "vst %%v17,144(%%r1,%[y]),3\n\t" + "vst %%v18,160(%%r1,%[y]),3\n\t" + "vst %%v19,176(%%r1,%[y]),3\n\t" + "vst %%v24,192(%%r1,%[y]),3\n\t" + "vst %%v25,208(%%r1,%[y]),3\n\t" + "vst %%v26,224(%%r1,%[y]),3\n\t" + "vst %%v27,240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f5f601717c..0dd8ed08a1 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -43,22 +43,22 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[x])\n\t" "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[y]),3\n\t" + "vl %%v25,16(%%r1,%[y]),3\n\t" + "vl %%v26,32(%%r1,%[y]),3\n\t" + "vl %%v27,48(%%r1,%[y]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index c93ff9b548..87ed6ecd1f 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -52,26 +52,26 @@ static void 
dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,0(%%r1,%[ap2]),3\n\t" + "vl %%v19,0(%%r1,%[ap3]),3\n\t" + "vl %%v20,16(%%r1,%[ap0]),3\n\t" + "vl %%v21,16(%%r1,%[ap1]),3\n\t" + "vl %%v22,16(%%r1,%[ap2]),3\n\t" + "vl %%v23,16(%%r1,%[ap3]),3\n\t" + "vl %%v24,32(%%r1,%[ap0]),3\n\t" + "vl %%v25,32(%%r1,%[ap1]),3\n\t" + "vl %%v26,32(%%r1,%[ap2]),3\n\t" + "vl %%v27,32(%%r1,%[ap3]),3\n\t" + "vl %%v28,48(%%r1,%[ap0]),3\n\t" + "vl %%v29,48(%%r1,%[ap1]),3\n\t" + "vl %%v30,48(%%r1,%[ap2]),3\n\t" + "vl %%v31,48(%%r1,%[ap3]),3\n\t" + "vl %%v4,0(%%r1,%[y]),3\n\t" + "vl %%v5,16(%%r1,%[y]),3\n\t" + "vl %%v6,32(%%r1,%[y]),3\n\t" + "vl %%v7,48(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -88,30 +88,30 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl %%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" + "vst %%v4,0(%%r1,%[y]),3\n\t" + "vst %%v5,16(%%r1,%[y]),3\n\t" + "vst %%v6,32(%%r1,%[y]),3\n\t" + "vst %%v7,48(%%r1,%[y]),3\n\t" + "vl %%v16,64(%%r1,%[ap0]),3\n\t" + "vl %%v17,64(%%r1,%[ap1]),3\n\t" + "vl %%v18,64(%%r1,%[ap2]),3\n\t" + "vl %%v19,64(%%r1,%[ap3]),3\n\t" + "vl %%v20,80(%%r1,%[ap0]),3\n\t" + "vl %%v21,80(%%r1,%[ap1]),3\n\t" + "vl %%v22,80(%%r1,%[ap2]),3\n\t" + "vl %%v23,80(%%r1,%[ap3]),3\n\t" + "vl %%v24,96(%%r1,%[ap0]),3\n\t" + "vl %%v25,96(%%r1,%[ap1]),3\n\t" + "vl %%v26,96(%%r1,%[ap2]),3\n\t" + "vl %%v27,96(%%r1,%[ap3]),3\n\t" + "vl %%v28,112(%%r1,%[ap0]),3\n\t" + "vl %%v29,112(%%r1,%[ap1]),3\n\t" + "vl %%v30,112(%%r1,%[ap2]),3\n\t" + "vl %%v31,112(%%r1,%[ap3]),3\n\t" + "vl %%v4,64(%%r1,%[y]),3\n\t" + "vl %%v5,80(%%r1,%[y]),3\n\t" + "vl %%v6,96(%%r1,%[y]),3\n\t" + "vl %%v7,112(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -128,10 +128,10 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT 
*x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" + "vst %%v4,64(%%r1,%[y]),3\n\t" + "vst %%v5,80(%%r1,%[y]),3\n\t" + "vst %%v6,96(%%r1,%[y]),3\n\t" + "vst %%v7,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -141,16 +141,16 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,0(%%r1,%[ap2]),3\n\t" + "vl %%v19,0(%%r1,%[ap3]),3\n\t" + "vl %%v20,16(%%r1,%[ap0]),3\n\t" + "vl %%v21,16(%%r1,%[ap1]),3\n\t" + "vl %%v22,16(%%r1,%[ap2]),3\n\t" + "vl %%v23,16(%%r1,%[ap3]),3\n\t" + "vl %%v4,0(%%r1,%[y]),3\n\t" + "vl %%v5,16(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" @@ -159,8 +159,8 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v4,0(%%r1,%[y]),3\n\t" + "vst %%v5,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -193,30 +193,30 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,16(%%r1,%[ap0]),3\n\t" + "vl %%v19,16(%%r1,%[ap1]),3\n\t" + "vl %%v20,32(%%r1,%[ap0]),3\n\t" + "vl %%v21,32(%%r1,%[ap1]),3\n\t" + "vl %%v22,48(%%r1,%[ap0]),3\n\t" + "vl %%v23,48(%%r1,%[ap1]),3\n\t" + "vl %%v24,64(%%r1,%[ap0]),3\n\t" + "vl %%v25,64(%%r1,%[ap1]),3\n\t" + "vl %%v26,80(%%r1,%[ap0]),3\n\t" + "vl %%v27,80(%%r1,%[ap1]),3\n\t" + "vl %%v28,96(%%r1,%[ap0]),3\n\t" + "vl %%v29,96(%%r1,%[ap1]),3\n\t" + "vl %%v30,112(%%r1,%[ap0]),3\n\t" + "vl %%v31,112(%%r1,%[ap1]),3\n\t" + "vl %%v2,0(%%r1,%[y]),3\n\t" + "vl %%v3,16(%%r1,%[y]),3\n\t" + "vl %%v4,32(%%r1,%[y]),3\n\t" + "vl %%v5,48(%%r1,%[y]),3\n\t" + "vl %%v6,64(%%r1,%[y]),3\n\t" + "vl %%v7,80(%%r1,%[y]),3\n\t" + "vl %%v8,96(%%r1,%[y]),3\n\t" + "vl %%v9,112(%%r1,%[y]),3\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb 
%%v4,%%v20,%%v0,%%v4\n\t" @@ -233,14 +233,14 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" + "vst %%v2,0(%%r1,%[y]),3\n\t" + "vst %%v3,16(%%r1,%[y]),3\n\t" + "vst %%v4,32(%%r1,%[y]),3\n\t" + "vst %%v5,48(%%r1,%[y]),3\n\t" + "vst %%v6,64(%%r1,%[y]),3\n\t" + "vst %%v7,80(%%r1,%[y]),3\n\t" + "vst %%v8,96(%%r1,%[y]),3\n\t" + "vst %%v9,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -250,18 +250,18 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,16(%%r1,%[ap0]),3\n\t" + "vl %%v19,16(%%r1,%[ap1]),3\n\t" + "vl %%v2,0(%%r1,%[y]),3\n\t" + "vl %%v3,16(%%r1,%[y]),3\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v2,0(%%r1,%[y]),3\n\t" + "vst %%v3,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -289,22 +289,22 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0]),3\n\t" + "vl %%v17,16(%%r1,%[a0]),3\n\t" + "vl %%v18,32(%%r1,%[a0]),3\n\t" + "vl %%v19,48(%%r1,%[a0]),3\n\t" + "vl %%v20,64(%%r1,%[a0]),3\n\t" + "vl %%v21,80(%%r1,%[a0]),3\n\t" + "vl %%v22,96(%%r1,%[a0]),3\n\t" + "vl %%v23,112(%%r1,%[a0]),3\n\t" + "vl %%v24,0(%%r1,%[y]),3\n\t" + "vl %%v25,16(%%r1,%[y]),3\n\t" + "vl %%v26,32(%%r1,%[y]),3\n\t" + "vl %%v27,48(%%r1,%[y]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" @@ -313,14 +313,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" + "vst %%v24,0(%%r1,%[y]),3\n\t" + "vst %%v25,16(%%r1,%[y]),3\n\t" + "vst %%v26,32(%%r1,%[y]),3\n\t" + "vst %%v27,48(%%r1,%[y]),3\n\t" + "vst 
%%v28,64(%%r1,%[y]),3\n\t" + "vst %%v29,80(%%r1,%[y]),3\n\t" + "vst %%v30,96(%%r1,%[y]),3\n\t" + "vst %%v31,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -330,14 +330,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,0(%%r1,%[y])\n\t" - "vl %%v19,16(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0]),3\n\t" + "vl %%v17,16(%%r1,%[a0]),3\n\t" + "vl %%v18,0(%%r1,%[y]),3\n\t" + "vl %%v19,16(%%r1,%[y]),3\n\t" "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y])\n\t" - "vst %%v19,16(%%r1,%[y])\n\t" + "vst %%v18,0(%%r1,%[y]),3\n\t" + "vst %%v19,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 24680cf1b7..9fd3c09d69 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -50,77 +50,77 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" + "vl %%v26,0(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" + "vl %%v27,0(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" + "vl %%v28,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" + "vl %%v29,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" + "vl %%v30,16(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" + "vl %%v31,16(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v24,32(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v25,32(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v26,32(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v27,32(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v28,48(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v29,48(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v30,48(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v31,48(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v24,64(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v25,64(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl 
%%v26,64(%%r1,%[ap2])\n\t" + "vl %%v26,64(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" + "vl %%v27,64(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" + "vl %%v28,80(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" + "vl %%v29,80(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" + "vl %%v30,80(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" + "vl %%v31,80(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v24,96(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v25,96(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v26,96(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v27,96(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v28,112(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v29,112(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v30,112(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v31,112(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -131,23 +131,23 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" + "vl %%v26,0(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" + "vl %%v27,0(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" + "vl %%v28,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" + "vl %%v29,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" + "vl %%v30,16(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" + "vl %%v31,16(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -198,45 +198,45 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" + "vl 
%%v26,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" + "vl %%v27,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" + "vl %%v28,32(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" + "vl %%v29,32(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" + "vl %%v30,48(%%r1,%[ap0]),3\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" + "vl %%v31,48(%%r1,%[ap1]),3\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v24,64(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v25,64(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v26,80(%%r1,%[ap0]),3\n\t" "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v27,80(%%r1,%[ap1]),3\n\t" "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v28,96(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v29,96(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v30,112(%%r1,%[ap0]),3\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v31,112(%%r1,%[ap1]),3\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -247,15 +247,15 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" + "vl %%v26,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" + "vl %%v27,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -299,29 +299,29 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[a0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" + "vl %%v25,16(%%r1,%[a0]),3\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" + "vl %%v26,32(%%r1,%[a0]),3\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" + "vl %%v27,48(%%r1,%[a0]),3\n\t" "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" + "vl %%v28,64(%%r1,%[a0]),3\n\t" "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" + "vl %%v29,80(%%r1,%[a0]),3\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" + "vl %%v30,96(%%r1,%[a0]),3\n\t" "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vl 
%%v31,112(%%r1,%[a0])\n\t" + "vl %%v31,112(%%r1,%[a0]),3\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -332,11 +332,11 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[a0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" + "vl %%v25,16(%%r1,%[a0]),3\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -378,38 +378,38 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl %%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src]),3\n\t" + "vl %%v17,16(%%r1,%[src]),3\n\t" + "vl %%v18,32(%%r1,%[src]),3\n\t" + "vl %%v19,48(%%r1,%[src]),3\n\t" + "vl %%v20,64(%%r1,%[src]),3\n\t" + "vl %%v21,80(%%r1,%[src]),3\n\t" + "vl %%v22,96(%%r1,%[src]),3\n\t" + "vl %%v23,112(%%r1,%[src]),3\n\t" + "vl %%v24, 0(%%r1,%[dest]),3\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" + "vst %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v25, 16(%%r1,%[dest]),3\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" + "vst %%v25, 16(%%r1,%[dest]),3\n\t" + "vl %%v26, 32(%%r1,%[dest]),3\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" + "vst %%v26, 32(%%r1,%[dest]),3\n\t" + "vl %%v27, 48(%%r1,%[dest]),3\n\t" "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" + "vst %%v27, 48(%%r1,%[dest]),3\n\t" + "vl %%v28, 64(%%r1,%[dest]),3\n\t" "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" + "vst %%v28, 64(%%r1,%[dest]),3\n\t" + "vl %%v29, 80(%%r1,%[dest]),3\n\t" "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" + "vst %%v29, 80(%%r1,%[dest]),3\n\t" + "vl %%v30, 96(%%r1,%[dest]),3\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" + "vst %%v30, 96(%%r1,%[dest]),3\n\t" + "vl %%v31, 112(%%r1,%[dest]),3\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" + "vst %%v31, 112(%%r1,%[dest]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -419,14 +419,14 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src]),3\n\t" + "vl %%v17,16(%%r1,%[src]),3\n\t" + "vl %%v24, 0(%%r1,%[dest]),3\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" + "vst %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v25, 16(%%r1,%[dest]),3\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" + "vst %%v25, 16(%%r1,%[dest]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git 
a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index 65ed31f01b..cc0f23c877 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmaxdb %%v16,%%v16,%%v24,0\n\t" "vfmaxdb %%v17,%%v17,%%v25,0\n\t" "vfmaxdb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 87bccbe55d..83d827d35f 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" @@ -59,14 +59,14 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 518cc262ce..754828b7c9 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmindb %%v16,%%v16,%%v24,0\n\t" "vfmindb %%v17,%%v17,%%v25,0\n\t" "vfmindb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index 91561992f5..ff0fca48c2 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" @@ -59,14 +59,14 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index 8f0197f023..de2207fcd1 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -35,14 +35,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x]),3\n\t" + "vl %%v25, 16(%%r1,%[x]),3\n\t" + "vl %%v26, 32(%%r1,%[x]),3\n\t" + "vl %%v27, 48(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[y]),3\n\t" + "vl %%v17, 16(%%r1,%[y]),3\n\t" + "vl %%v18, 32(%%r1,%[y]),3\n\t" + "vl %%v19, 48(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" + "vst %%v28, 0(%%r1,%[x]),3\n\t" + "vst %%v29, 16(%%r1,%[x]),3\n\t" + "vst %%v30, 32(%%r1,%[x]),3\n\t" + "vst %%v31, 48(%%r1,%[x]),3\n\t" + "vst %%v20, 0(%%r1,%[y]),3\n\t" + "vst %%v21, 16(%%r1,%[y]),3\n\t" + "vst %%v22, 32(%%r1,%[y]),3\n\t" + "vst 
%%v23, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 64(%%r1,%[x]),3\n\t" + "vl %%v25, 80(%%r1,%[x]),3\n\t" + "vl %%v26, 96(%%r1,%[x]),3\n\t" + "vl %%v27, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 64(%%r1,%[y]),3\n\t" + "vl %%v17, 80(%%r1,%[y]),3\n\t" + "vl %%v18, 96(%%r1,%[y]),3\n\t" + "vl %%v19, 112(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" + "vst %%v28, 64(%%r1,%[x]),3\n\t" + "vst %%v29, 80(%%r1,%[x]),3\n\t" + "vst %%v30, 96(%%r1,%[x]),3\n\t" + "vst %%v31, 112(%%r1,%[x]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[y]),3\n\t" + "vl %%v17, 144(%%r1,%[y]),3\n\t" + "vl %%v18, 160(%%r1,%[y]),3\n\t" + "vl %%v19, 176(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" + "vst %%v28, 128(%%r1,%[x]),3\n\t" + "vst %%v29, 144(%%r1,%[x]),3\n\t" + "vst %%v30, 160(%%r1,%[x]),3\n\t" + "vst %%v31, 176(%%r1,%[x]),3\n\t" + "vst %%v20, 128(%%r1,%[y]),3\n\t" + "vst %%v21, 144(%%r1,%[y]),3\n\t" + "vst %%v22, 160(%%r1,%[y]),3\n\t" + "vst %%v23, 176(%%r1,%[y]),3\n\t" + "vl %%v24, 192(%%r1,%[x]),3\n\t" + "vl %%v25, 208(%%r1,%[x]),3\n\t" + "vl %%v26, 224(%%r1,%[x]),3\n\t" + "vl %%v27, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 192(%%r1,%[y]),3\n\t" + "vl %%v17, 208(%%r1,%[y]),3\n\t" + "vl %%v18, 224(%%r1,%[y]),3\n\t" + "vl %%v19, 240(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 
192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[x]),3\n\t" + "vst %%v29, 208(%%r1,%[x]),3\n\t" + "vst %%v30, 224(%%r1,%[x]),3\n\t" + "vst %%v31, 240(%%r1,%[x]),3\n\t" + "vst %%v20, 192(%%r1,%[y]),3\n\t" + "vst %%v21, 208(%%r1,%[y]),3\n\t" + "vst %%v22, 224(%%r1,%[y]),3\n\t" + "vst %%v23, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index c944990b5a..bc58569d59 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -33,30 +33,30 @@ static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x]),3\n\t" "vfmdb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" + "vst %%v24,0(%%r1,%[x]),3\n\t" + "vl %%v25,16(%%r1,%[x]),3\n\t" "vfmdb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" + "vst %%v25,16(%%r1,%[x]),3\n\t" + "vl %%v26,32(%%r1,%[x]),3\n\t" "vfmdb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" + "vst %%v26,32(%%r1,%[x]),3\n\t" + "vl %%v27,48(%%r1,%[x]),3\n\t" "vfmdb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" + "vst %%v27,48(%%r1,%[x]),3\n\t" + "vl %%v28,64(%%r1,%[x]),3\n\t" "vfmdb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" + "vst %%v28,64(%%r1,%[x]),3\n\t" + "vl %%v29,80(%%r1,%[x]),3\n\t" "vfmdb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" + "vst %%v29,80(%%r1,%[x]),3\n\t" + "vl %%v30,96(%%r1,%[x]),3\n\t" "vfmdb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" + "vst %%v30,96(%%r1,%[x]),3\n\t" + "vl %%v31,112(%%r1,%[x]),3\n\t" "vfmdb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" + "vst %%v31,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) @@ -71,14 +71,14 @@ static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x]),3\n\t" + "vst %%v0,16(%%r1,%[x]),3\n\t" + "vst %%v0,32(%%r1,%[x]),3\n\t" + "vst %%v0,48(%%r1,%[x]),3\n\t" + "vst %%v0,64(%%r1,%[x]),3\n\t" + "vst %%v0,80(%%r1,%[x]),3\n\t" + "vst %%v0,96(%%r1,%[x]),3\n\t" + "vst %%v0,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 60ba40bd62..f4da46dc14 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -33,70 +33,70 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" 
- "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v28, 192(%%r1,%[x]),3\n\t" + "vl %%v29, 208(%%r1,%[x]),3\n\t" + "vl %%v30, 224(%%r1,%[x]),3\n\t" + "vl %%v31, 240(%%r1,%[x]),3\n\t" + "vl %%v0, 0(%%r1,%[y]),3\n\t" + "vl %%v1, 16(%%r1,%[y]),3\n\t" + "vl %%v2, 32(%%r1,%[y]),3\n\t" + "vl %%v3, 48(%%r1,%[y]),3\n\t" + "vl %%v4, 64(%%r1,%[y]),3\n\t" + "vl %%v5, 80(%%r1,%[y]),3\n\t" + "vl %%v6, 96(%%r1,%[y]),3\n\t" + "vl %%v7, 112(%%r1,%[y]),3\n\t" + "vst %%v0, 0(%%r1,%[x]),3\n\t" + "vst %%v1, 16(%%r1,%[x]),3\n\t" + "vst %%v2, 32(%%r1,%[x]),3\n\t" + "vst %%v3, 48(%%r1,%[x]),3\n\t" + "vst %%v4, 64(%%r1,%[x]),3\n\t" + "vst %%v5, 80(%%r1,%[x]),3\n\t" + "vst %%v6, 96(%%r1,%[x]),3\n\t" + "vst %%v7, 112(%%r1,%[x]),3\n\t" + "vl %%v0, 128(%%r1,%[y]),3\n\t" + "vl %%v1, 144(%%r1,%[y]),3\n\t" + "vl %%v2, 160(%%r1,%[y]),3\n\t" + "vl %%v3, 176(%%r1,%[y]),3\n\t" + "vl %%v4, 192(%%r1,%[y]),3\n\t" + "vl %%v5, 208(%%r1,%[y]),3\n\t" + "vl %%v6, 224(%%r1,%[y]),3\n\t" + "vl %%v7, 240(%%r1,%[y]),3\n\t" + "vst %%v0, 128(%%r1,%[x]),3\n\t" + "vst %%v1, 144(%%r1,%[x]),3\n\t" + "vst %%v2, 160(%%r1,%[x]),3\n\t" + "vst %%v3, 176(%%r1,%[x]),3\n\t" + "vst %%v4, 192(%%r1,%[x]),3\n\t" + "vst %%v5, 208(%%r1,%[x]),3\n\t" + "vst %%v6, 224(%%r1,%[x]),3\n\t" + "vst %%v7, 240(%%r1,%[x]),3\n\t" + "vst %%v16, 0(%%r1,%[y]),3\n\t" + "vst %%v17, 16(%%r1,%[y]),3\n\t" + "vst %%v18, 
32(%%r1,%[y]),3\n\t" + "vst %%v19, 48(%%r1,%[y]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vst %%v24, 128(%%r1,%[y]),3\n\t" + "vst %%v25, 144(%%r1,%[y]),3\n\t" + "vst %%v26, 160(%%r1,%[y]),3\n\t" + "vst %%v27, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[y]),3\n\t" + "vst %%v29, 208(%%r1,%[y]),3\n\t" + "vst %%v30, 224(%%r1,%[y]),3\n\t" + "vst %%v31, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 8434c811f4..bd0f181152 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 80a37e6c25..4884d1e3a7 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 18cdba4376..a6b95bf3e3 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 02ca427e47..c3f36d964f 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 43ae8ff8b7..83e5e93c99 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -45,14 +45,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[x]),3\n\t" + "vl %%v17, 144(%%r1,%[x]),3\n\t" + "vl %%v18, 160(%%r1,%[x]),3\n\t" + "vl %%v19, 176(%%r1,%[x]),3\n\t" + "vl %%v20, 192(%%r1,%[x]),3\n\t" + "vl %%v21, 208(%%r1,%[x]),3\n\t" + "vl %%v22, 224(%%r1,%[x]),3\n\t" + "vl %%v23, 240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 31549849d8..77bb09a2e2 100644 --- a/kernel/zarch/zaxpy.c +++ 
b/kernel/zarch/zaxpy.c @@ -45,22 +45,22 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x]),3\n\t" + "vl %%v9,16(%%r1,%[x]),3\n\t" + "vl %%v10,32(%%r1,%[x]),3\n\t" + "vl %%v11,48(%%r1,%[x]),3\n\t" + "vl %%v12,0(%%r1,%[y]),3\n\t" + "vl %%v13,16(%%r1,%[y]),3\n\t" + "vl %%v14,32(%%r1,%[y]),3\n\t" + "vl %%v15,48(%%r1,%[y]),3\n\t" + "vl %%v16,64(%%r1,%[x]),3\n\t" + "vl %%v17,80(%%r1,%[x]),3\n\t" + "vl %%v18,96(%%r1,%[x]),3\n\t" + "vl %%v19,112(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[y]),3\n\t" + "vl %%v21,80(%%r1,%[y]),3\n\t" + "vl %%v22,96(%%r1,%[y]),3\n\t" + "vl %%v23,112(%%r1,%[y]),3\n\t" "vpdi %%v24,%%v8,%%v8,4\n\t" "vpdi %%v25,%%v9,%%v9,4\n\t" "vpdi %%v26,%%v10,%%v10,4\n\t" @@ -85,14 +85,14 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" + "vst %%v8,0(%%r1,%[y]),3\n\t" + "vst %%v9,16(%%r1,%[y]),3\n\t" + "vst %%v10,32(%%r1,%[y]),3\n\t" + "vst %%v11,48(%%r1,%[y]),3\n\t" + "vst %%v16,64(%%r1,%[y]),3\n\t" + "vst %%v17,80(%%r1,%[y]),3\n\t" + "vst %%v18,96(%%r1,%[y]),3\n\t" + "vst %%v19,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 7a67ef734b..8cfbaadb83 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -41,14 +41,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v0, 0(%%r1,%[y]),3\n\t" + "vl %%v1, 16(%%r1,%[y]),3\n\t" + "vl %%v2, 32(%%r1,%[y]),3\n\t" + "vl %%v3, 48(%%r1,%[y]),3\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" @@ -61,14 +61,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" + "vl %%v16, 64(%%r1,%[x]),3\n\t" + "vl %%v17, 80(%%r1,%[x]),3\n\t" + "vl %%v18, 
96(%%r1,%[x]),3\n\t" + "vl %%v19, 112(%%r1,%[x]),3\n\t" + "vl %%v0, 64(%%r1,%[y]),3\n\t" + "vl %%v1, 80(%%r1,%[y]),3\n\t" + "vl %%v2, 96(%%r1,%[y]),3\n\t" + "vl %%v3, 112(%%r1,%[y]),3\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 7f21985ecf..4b64fc8a56 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" - "vl %%v18,32(%[x])\n\t" - "vl %%v19,48(%[x])\n\t" + __asm__("vl %%v16,0(%[x]),3\n\t" + "vl %%v17,16(%[x]),3\n\t" + "vl %%v18,32(%[x]),3\n\t" + "vl %%v19,48(%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v20,8(%[x]),0\n\t" "wflcdb %%v20,%%v20\n\t" @@ -69,8 +69,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y]),3\n\t" + "vl %%v1,16(%%r1,%[y]),3\n\t" "vlrepg %%v24,0(%%r1,%[ap0])\n\t" "vlrepg %%v25,8(%%r1,%[ap0])\n\t" "vlrepg %%v26,0(%%r1,%[ap1])\n\t" @@ -103,8 +103,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" + "vst %%v0,0(%%r1,%[y]),3\n\t" + "vst %%v1,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -119,8 +119,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" + __asm__("vl %%v16,0(%[x]),3\n\t" + "vl %%v17,16(%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v18,8(%[x]),0\n\t" "wflcdb %%v18,%%v18\n\t" @@ -142,8 +142,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y]),3\n\t" + "vl %%v1,16(%%r1,%[y]),3\n\t" "vlrepg %%v20,0(%%r1,%[ap0])\n\t" "vlrepg %%v21,8(%%r1,%[ap0])\n\t" "vlrepg %%v22,0(%%r1,%[ap1])\n\t" @@ -160,8 +160,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" + "vst %%v0,0(%%r1,%[y]),3\n\t" + "vst %%v1,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -173,7 +173,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" + __asm__("vl %%v16,0(%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v17,8(%[x]),0\n\t" "wflcdb %%v17,%%v17\n\t" @@ -188,8 +188,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 
2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y]),3\n\t" + "vl %%v1,16(%%r1,%[y]),3\n\t" "vlrepg %%v18,0(%%r1,%[ap])\n\t" "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vlrepg %%v20,16(%%r1,%[ap])\n\t" @@ -198,8 +198,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" + "vst %%v0,0(%%r1,%[y]),3\n\t" + "vst %%v1,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -227,14 +227,14 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,0(%%r1,%[dest])\n\t" - "vl %%v21,16(%%r1,%[dest])\n\t" - "vl %%v22,32(%%r1,%[dest])\n\t" - "vl %%v23,48(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src]),3\n\t" + "vl %%v17,16(%%r1,%[src]),3\n\t" + "vl %%v18,32(%%r1,%[src]),3\n\t" + "vl %%v19,48(%%r1,%[src]),3\n\t" + "vl %%v20,0(%%r1,%[dest]),3\n\t" + "vl %%v21,16(%%r1,%[dest]),3\n\t" + "vl %%v22,32(%%r1,%[dest]),3\n\t" + "vl %%v23,48(%%r1,%[dest]),3\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -247,10 +247,10 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest])\n\t" - "vst %%v29,16(%%r1,%[dest])\n\t" - "vst %%v30,32(%%r1,%[dest])\n\t" - "vst %%v31,48(%%r1,%[dest])\n\t" + "vst %%v28,0(%%r1,%[dest]),3\n\t" + "vst %%v29,16(%%r1,%[dest]),3\n\t" + "vst %%v30,32(%%r1,%[dest]),3\n\t" + "vst %%v31,48(%%r1,%[dest]),3\n\t" "agfi %%r1,64\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 7b3e6c1fc6..429824dcf8 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -73,7 +73,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vl %%v0,16(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -120,10 +120,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v24,0(%[alpha]),0\n\t" "vlrepg %%v25,8(%[alpha])\n\t" #endif - "vl %%v26,0(%[y])\n\t" - "vl %%v27,16(%[y])\n\t" - "vl %%v28,32(%[y])\n\t" - "vl %%v29,48(%[y])\n\t" + "vl %%v26,0(%[y]),3\n\t" + "vl %%v27,16(%[y]),3\n\t" + "vl %%v28,32(%[y]),3\n\t" + "vl %%v29,48(%[y]),3\n\t" "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" @@ -132,10 +132,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb 
%%v28,%%v22,%%v25,%%v28\n\t" "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y])\n\t" - "vst %%v27,16(%[y])\n\t" - "vst %%v28,32(%[y])\n\t" - "vst %%v29,48(%[y])" + "vst %%v26,0(%[y]),3\n\t" + "vst %%v27,16(%[y]),3\n\t" + "vst %%v28,32(%[y]),3\n\t" + "vst %%v29,48(%[y]),3" : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -160,7 +160,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -178,7 +178,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vl %%v0,16(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -213,14 +213,14 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v20,0(%[alpha]),0\n\t" "vlrepg %%v21,8(%[alpha])\n\t" #endif - "vl %%v22,0(%[y])\n\t" - "vl %%v23,16(%[y])\n\t" + "vl %%v22,0(%[y]),3\n\t" + "vl %%v23,16(%[y]),3\n\t" "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y])\n\t" - "vst %%v23,16(%[y])\n\t" + "vst %%v22,0(%[y]),3\n\t" + "vst %%v23,16(%[y]),3\n\t" : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -239,7 +239,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -253,7 +253,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vl %%v0,16(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -282,10 +282,10 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vleg %%v18,0(%[alpha]),0\n\t" "vlrepg %%v19,8(%[alpha])\n\t" #endif - "vl %%v0,0(%[y])\n\t" + "vl %%v0,0(%[y]),3\n\t" "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y])\n\t" + "vst %%v0,0(%[y]),3\n\t" : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index aa7f166052..ea81e4741c 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -35,14 +35,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl 
%%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x]),3\n\t" + "vl %%v25, 16(%%r1,%[x]),3\n\t" + "vl %%v26, 32(%%r1,%[x]),3\n\t" + "vl %%v27, 48(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[y]),3\n\t" + "vl %%v17, 16(%%r1,%[y]),3\n\t" + "vl %%v18, 32(%%r1,%[y]),3\n\t" + "vl %%v19, 48(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" + "vst %%v28, 0(%%r1,%[x]),3\n\t" + "vst %%v29, 16(%%r1,%[x]),3\n\t" + "vst %%v30, 32(%%r1,%[x]),3\n\t" + "vst %%v31, 48(%%r1,%[x]),3\n\t" + "vst %%v20, 0(%%r1,%[y]),3\n\t" + "vst %%v21, 16(%%r1,%[y]),3\n\t" + "vst %%v22, 32(%%r1,%[y]),3\n\t" + "vst %%v23, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 64(%%r1,%[x]),3\n\t" + "vl %%v25, 80(%%r1,%[x]),3\n\t" + "vl %%v26, 96(%%r1,%[x]),3\n\t" + "vl %%v27, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 64(%%r1,%[y]),3\n\t" + "vl %%v17, 80(%%r1,%[y]),3\n\t" + "vl %%v18, 96(%%r1,%[y]),3\n\t" + "vl %%v19, 112(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" + "vst %%v28, 64(%%r1,%[x]),3\n\t" + "vst %%v29, 80(%%r1,%[x]),3\n\t" + "vst %%v30, 96(%%r1,%[x]),3\n\t" + "vst %%v31, 112(%%r1,%[x]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[y]),3\n\t" + "vl %%v17, 144(%%r1,%[y]),3\n\t" + "vl %%v18, 160(%%r1,%[y]),3\n\t" + "vl %%v19, 176(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn 
*/ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" + "vst %%v28, 128(%%r1,%[x]),3\n\t" + "vst %%v29, 144(%%r1,%[x]),3\n\t" + "vst %%v30, 160(%%r1,%[x]),3\n\t" + "vst %%v31, 176(%%r1,%[x]),3\n\t" + "vst %%v20, 128(%%r1,%[y]),3\n\t" + "vst %%v21, 144(%%r1,%[y]),3\n\t" + "vst %%v22, 160(%%r1,%[y]),3\n\t" + "vst %%v23, 176(%%r1,%[y]),3\n\t" + "vl %%v24, 192(%%r1,%[x]),3\n\t" + "vl %%v25, 208(%%r1,%[x]),3\n\t" + "vl %%v26, 224(%%r1,%[x]),3\n\t" + "vl %%v27, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 192(%%r1,%[y]),3\n\t" + "vl %%v17, 208(%%r1,%[y]),3\n\t" + "vl %%v18, 224(%%r1,%[y]),3\n\t" + "vl %%v19, 240(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[x]),3\n\t" + "vst %%v29, 208(%%r1,%[x]),3\n\t" + "vst %%v30, 224(%%r1,%[x]),3\n\t" + "vst %%v31, 240(%%r1,%[x]),3\n\t" + "vst %%v20, 192(%%r1,%[y]),3\n\t" + "vst %%v21, 208(%%r1,%[y]),3\n\t" + "vst %%v22, 224(%%r1,%[y]),3\n\t" + "vst %%v23, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index fbcc0c5b9a..7fd62a1ac5 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -36,14 +36,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -68,14 +68,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" + 
"vst %%v16,0(%%r1,%[x]),3\n\t" + "vst %%v17,16(%%r1,%[x]),3\n\t" + "vst %%v18,32(%%r1,%[x]),3\n\t" + "vst %%v19,48(%%r1,%[x]),3\n\t" + "vst %%v20,64(%%r1,%[x]),3\n\t" + "vst %%v21,80(%%r1,%[x]),3\n\t" + "vst %%v22,96(%%r1,%[x]),3\n\t" + "vst %%v23,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -93,14 +93,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vpdi %%v16,%%v16,%%v16,4\n\t" "vpdi %%v17,%%v17,%%v17,4\n\t" "vpdi %%v18,%%v18,%%v18,4\n\t" @@ -117,14 +117,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" + "vst %%v16,0(%%r1,%[x]),3\n\t" + "vst %%v17,16(%%r1,%[x]),3\n\t" + "vst %%v18,32(%%r1,%[x]),3\n\t" + "vst %%v19,48(%%r1,%[x]),3\n\t" + "vst %%v20,64(%%r1,%[x]),3\n\t" + "vst %%v21,80(%%r1,%[x]),3\n\t" + "vst %%v22,96(%%r1,%[x]),3\n\t" + "vst %%v23,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -139,14 +139,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfmdb %%v16,%%v16,%%v0\n\t" "vfmdb %%v17,%%v17,%%v0\n\t" "vfmdb %%v18,%%v18,%%v0\n\t" @@ -155,14 +155,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" + "vst %%v16,0(%%r1,%[x]),3\n\t" + "vst %%v17,16(%%r1,%[x]),3\n\t" + "vst %%v18,32(%%r1,%[x]),3\n\t" + "vst %%v19,48(%%r1,%[x]),3\n\t" + "vst %%v20,64(%%r1,%[x]),3\n\t" + "vst %%v21,80(%%r1,%[x]),3\n\t" + "vst %%v22,96(%%r1,%[x]),3\n\t" + "vst %%v23,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -177,14 +177,14 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 
1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x]),3\n\t" + "vst %%v0,16(%%r1,%[x]),3\n\t" + "vst %%v0,32(%%r1,%[x]),3\n\t" + "vst %%v0,48(%%r1,%[x]),3\n\t" + "vst %%v0,64(%%r1,%[x]),3\n\t" + "vst %%v0,80(%%r1,%[x]),3\n\t" + "vst %%v0,96(%%r1,%[x]),3\n\t" + "vst %%v0,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0f38103be7..0252ab8db9 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -33,70 +33,70 @@ static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v28, 192(%%r1,%[x]),3\n\t" + "vl %%v29, 208(%%r1,%[x]),3\n\t" + "vl %%v30, 224(%%r1,%[x]),3\n\t" + "vl %%v31, 240(%%r1,%[x]),3\n\t" + "vl %%v0, 
0(%%r1,%[y]),3\n\t" + "vl %%v1, 16(%%r1,%[y]),3\n\t" + "vl %%v2, 32(%%r1,%[y]),3\n\t" + "vl %%v3, 48(%%r1,%[y]),3\n\t" + "vl %%v4, 64(%%r1,%[y]),3\n\t" + "vl %%v5, 80(%%r1,%[y]),3\n\t" + "vl %%v6, 96(%%r1,%[y]),3\n\t" + "vl %%v7, 112(%%r1,%[y]),3\n\t" + "vst %%v0, 0(%%r1,%[x]),3\n\t" + "vst %%v1, 16(%%r1,%[x]),3\n\t" + "vst %%v2, 32(%%r1,%[x]),3\n\t" + "vst %%v3, 48(%%r1,%[x]),3\n\t" + "vst %%v4, 64(%%r1,%[x]),3\n\t" + "vst %%v5, 80(%%r1,%[x]),3\n\t" + "vst %%v6, 96(%%r1,%[x]),3\n\t" + "vst %%v7, 112(%%r1,%[x]),3\n\t" + "vl %%v0, 128(%%r1,%[y]),3\n\t" + "vl %%v1, 144(%%r1,%[y]),3\n\t" + "vl %%v2, 160(%%r1,%[y]),3\n\t" + "vl %%v3, 176(%%r1,%[y]),3\n\t" + "vl %%v4, 192(%%r1,%[y]),3\n\t" + "vl %%v5, 208(%%r1,%[y]),3\n\t" + "vl %%v6, 224(%%r1,%[y]),3\n\t" + "vl %%v7, 240(%%r1,%[y]),3\n\t" + "vst %%v0, 128(%%r1,%[x]),3\n\t" + "vst %%v1, 144(%%r1,%[x]),3\n\t" + "vst %%v2, 160(%%r1,%[x]),3\n\t" + "vst %%v3, 176(%%r1,%[x]),3\n\t" + "vst %%v4, 192(%%r1,%[x]),3\n\t" + "vst %%v5, 208(%%r1,%[x]),3\n\t" + "vst %%v6, 224(%%r1,%[x]),3\n\t" + "vst %%v7, 240(%%r1,%[x]),3\n\t" + "vst %%v16, 0(%%r1,%[y]),3\n\t" + "vst %%v17, 16(%%r1,%[y]),3\n\t" + "vst %%v18, 32(%%r1,%[y]),3\n\t" + "vst %%v19, 48(%%r1,%[y]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vst %%v24, 128(%%r1,%[y]),3\n\t" + "vst %%v25, 144(%%r1,%[y]),3\n\t" + "vst %%v26, 160(%%r1,%[y]),3\n\t" + "vst %%v27, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[y]),3\n\t" + "vst %%v29, 208(%%r1,%[y]),3\n\t" + "vst %%v30, 224(%%r1,%[y]),3\n\t" + "vst %%v31, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
From 1391fc46d2c38bb74ed69b7a527ab8865161c915 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Feb 2019 19:29:33 +0100 Subject: [PATCH 091/189] fix second instance of complex.h for c++ as well --- lapack-netlib/LAPACKE/include/lapacke.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 6ded78c8b7..c5ea465e0d 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -70,7 +70,11 @@ /* Complex type (single precision) */ #ifndef lapack_complex_float +#ifndef __cplusplus #include <complex.h> +#else +#include <complex> +#endif #define lapack_complex_float float _Complex #endif @@ -86,7 +90,11 @@ lapack_complex_float lapack_make_complex_float( float re, float im ); /* Complex type (double precision) */ #ifndef lapack_complex_double +#ifndef __cplusplus #include <complex.h> +#else +#include <complex> +#endif #define lapack_complex_double double _Complex #endif
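The guard added above matters because lapacke.h is consumed from C++ as well as C, and under C++ the C header complex.h is at best a deprecated compatibility shim, so the patch selects the native <complex> header there instead. A minimal sketch of a consumer exercising the fixed header follows; the probe file is hypothetical, not part of the patch, and assumes a compiler such as GCC or Clang that accepts the _Complex extension in C++ mode.

/* use_lapacke.c - hypothetical probe, not part of the patch; compile once
 * as C (cc -c use_lapacke.c) and once as C++ (c++ -x c++ -c use_lapacke.c) */
#include "lapacke.h" /* after the fix: <complex.h> under C, <complex> under C++ */

int main(void) {
    /* lapack_complex_float expands to float _Complex in both modes */
    lapack_complex_float z = lapack_make_complex_float(1.0f, 2.0f);
    (void) z; /* silence unused-variable warnings */
    return 0;
}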
From d70ae3ab433bda46708f02bf74c03c861bfb546f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Feb 2019 20:06:34 +0100 Subject: [PATCH 092/189] Make c_check robust against old or incomplete perl installations by catching and working around failures to load modules, and avoiding object-oriented syntax in tempfile creation. Fixes #1989 --- c_check | 85 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 29 deletions(-)
diff --git a/c_check b/c_check index 9dc237bebc..38f9170ca1 100644 --- a/c_check +++ b/c_check @@ -1,7 +1,7 @@ #!/usr/bin/perl -use File::Basename; -use File::Temp qw(tempfile); +#use File::Basename; +# use File::Temp qw(tempfile); # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); @@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64"); $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); $hostarch = "zarch" if ($hostarch eq "s390x"); -$tmpf = new File::Temp( UNLINK => 1 ); +#$tmpf = new File::Temp( UNLINK => 1 ); $binary = $ENV{"BINARY"}; $makefile = shift(@ARGV); @@ -31,12 +31,25 @@ if ($?) { $cross_suffix = ""; -if (dirname($compiler_name) ne ".") { - $cross_suffix .= dirname($compiler_name) . "/"; -} +eval "use File::Basename"; +if ($@){ + warn "could not load PERL module File::Basename, emulating its functionality"; + my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 ); + if ($dirnam ne ".") { + $cross_suffix .= $dirnam . "/"; + } + my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1); + if ($basnam =~ /([^\s]*-)(.*)/) { + $cross_suffix .= $1; + } +} else { + if (dirname($compiler_name) ne ".") { + $cross_suffix .= dirname($compiler_name) . "/"; + } -if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { - $cross_suffix .= $1; + if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { + $cross_suffix .= $1; + } } $compiler = ""; @@ -171,20 +184,26 @@ if ($?) { $have_msa = 0; if (($architecture eq "mips") || ($architecture eq "mips64")) { - $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; - print $tmpf "#include <msa.h>\n\n"; - print $tmpf "void main(void){ __asm__ volatile($code); }\n"; - - $args = "$msa_flags -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args"); - system(@cmd) == 0; - if ($? != 0) { - $have_msa = 0; + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check MSA capability"; } else { - $have_msa = 1; + $tmpf = new File::Temp( UNLINK => 1 ); + $code = '"addvi.b $w0, $w1, 1"'; + $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + print $tmpf "#include <msa.h>\n\n"; + print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + + $args = "$msa_flags -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args"); + system(@cmd) == 0; + if ($? != 0) { + $have_msa = 0; + } else { + $have_msa = 1; + } + unlink("$tmpf.o"); } - unlink("$tmpf.o"); } $architecture = x86 if ($data =~ /ARCH_X86/); @@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/); $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { - $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; - print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; - $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); - system(@cmd) == 0; - if ($? != 0) { - $no_avx512 = 1; - } else { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512"; $no_avx512 = 0; + } else { +# $tmpf = new File::Temp( UNLINK => 1 ); + ($fh,$tmpf) = tempfile( UNLINK => 1 ); + $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; + print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; + $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_avx512 = 1; + } else { + $no_avx512 = 0; + } + unlink("$tmpf.o"); } - unlink("tmpf.o"); } $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
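Two small Perl idioms carry the commit above and are worth spelling out: a string eval of the use statement defers module loading from compile time to run time, so a missing module merely sets $@ instead of aborting the whole script, and the plain function form of tempfile() avoids the object-oriented interface (new File::Temp) that very old File::Temp releases lack. A standalone sketch of the pattern follows, as a hypothetical script rather than an excerpt from c_check.

#!/usr/bin/perl
# Probe for an optional module at run time; a literal "use File::Temp;"
# at the top would abort the whole script on perls that lack the module.
eval "use File::Temp qw(tempfile)";
if ($@) {
    # $@ carries the loader's error text; degrade gracefully instead
    warn "File::Temp unavailable, skipping the optional compiler probe";
} else {
    # function-call form: no "new File::Temp" object syntax required
    my ($fh, $tmpname) = tempfile( UNLINK => 1 );
    print $fh "int main(void){ return 0; }\n";
    close($fh);
    print "wrote probe source to $tmpname\n";
}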
From 5952e586ceaa7ea68376f1580c6c96edca55804b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Feb 2019 23:51:40 +0100 Subject: [PATCH 093/189] Support DYNAMIC_LIST option in cmake e.g. cmake -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST="NEHALEM;HASWELL;ZEN" .. original issue was #1639 --- cmake/arch.cmake | 3 +++ cmake/system.cmake | 7 +++++++ 2 files changed, 10 insertions(+)
diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 63fb86fa21..470ea2a8f3 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -74,6 +74,9 @@ if (DYNAMIC_ARCH) if (NOT NO_AVX512) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) endif () + if (DYNAMIC_LIST) + set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) + endif () endif () if (NOT DYNAMIC_CORE) diff --git a/cmake/system.cmake b/cmake/system.cmake index 4cee7bd18f..7fda2adb92 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -187,6 +187,13 @@ if (DYNAMIC_ARCH) endif () endif () +if (DYNAMIC_LIST) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST") + foreach(DCORE ${DYNAMIC_LIST}) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}") + endforeach () +endif () + if (NO_LAPACK) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") #Disable LAPACK C interface
From 70397701652743587a88b20837c3b6e2c1da74f0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 6 Feb 2019 20:11:44 +0200 Subject: [PATCH 094/189] [ZARCH] Undo the last commit --- kernel/zarch/damax.c | 34 ++--- kernel/zarch/damax_z13.c | 34 ++--- kernel/zarch/damin.c | 34 ++--- kernel/zarch/damin_z13.c | 34 ++--- kernel/zarch/dasum.c | 32 ++--- kernel/zarch/daxpy.c | 96 +++++++-------- kernel/zarch/ddot.c | 32 ++--- kernel/zarch/dgemv_n_4.c | 260 +++++++++++++++++++-------------------- kernel/zarch/dgemv_t_4.c | 260 +++++++++++++++++++-------------------- kernel/zarch/dmax.c | 34 ++--- kernel/zarch/dmax_z13.c | 34 ++--- kernel/zarch/dmin.c | 34 ++--- kernel/zarch/dmin_z13.c | 34 ++--- kernel/zarch/drot.c | 128 +++++++++---------- kernel/zarch/dscal.c | 48 ++++---- kernel/zarch/dswap.c | 128 +++++++++---------- kernel/zarch/idamax.c | 34 ++--- kernel/zarch/idamin.c | 34 ++--- kernel/zarch/idmax.c | 34 ++--- kernel/zarch/idmin.c | 34 ++--- kernel/zarch/zasum.c | 32 ++--- kernel/zarch/zaxpy.c | 48 ++++---- kernel/zarch/zdot.c | 32 ++--- kernel/zarch/zgemv_n_4.c | 62 +++++----- kernel/zarch/zgemv_t_4.c | 40 +++--- kernel/zarch/zrot.c | 128 +++++++++---------- kernel/zarch/zscal.c | 112 ++++++++--------- kernel/zarch/zswap.c | 128 +++++++++---------- 28 files changed, 987 insertions(+), 987 deletions(-) diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 2598145c31..37008f702d 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmaxdb %%v16,%%v16,%%v24,8\n\t" "vfmaxdb %%v17,%%v17,%%v25,8\n\t" "vfmaxdb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index f7e11c3cea..530d6e5bb6 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 25f018c662..a01791741d 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmindb %%v16,%%v16,%%v24,8\n\t" "vfmindb %%v17,%%v17,%%v25,8\n\t" "vfmindb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 091aceb37d..2172b6d6f5 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 641949963e..9f69a99314 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -45,14 +45,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 
16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x]),3\n\t" - "vl %%v17, 144(%%r1,%[x]),3\n\t" - "vl %%v18, 160(%%r1,%[x]),3\n\t" - "vl %%v19, 176(%%r1,%[x]),3\n\t" - "vl %%v20, 192(%%r1,%[x]),3\n\t" - "vl %%v21, 208(%%r1,%[x]),3\n\t" - "vl %%v22, 224(%%r1,%[x]),3\n\t" - "vl %%v23, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index c02ad0aac3..179ef8834c 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -34,22 +34,22 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,0(%%r1,%[y]),3\n\t" - "vl %%v21,16(%%r1,%[y]),3\n\t" - "vl %%v22,32(%%r1,%[y]),3\n\t" - "vl %%v23,48(%%r1,%[y]),3\n\t" - "vl %%v24,64(%%r1,%[x]),3\n\t" - "vl %%v25,80(%%r1,%[x]),3\n\t" - "vl %%v26,96(%%r1,%[x]),3\n\t" - "vl %%v27,112(%%r1,%[x]),3\n\t" - "vl %%v28,64(%%r1,%[y]),3\n\t" - "vl %%v29,80(%%r1,%[y]),3\n\t" - "vl %%v30,96(%%r1,%[y]),3\n\t" - "vl %%v31,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -58,30 +58,30 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y]),3\n\t" - "vst %%v17,16(%%r1,%[y]),3\n\t" - "vst %%v18,32(%%r1,%[y]),3\n\t" - "vst %%v19,48(%%r1,%[y]),3\n\t" - "vst %%v24,64(%%r1,%[y]),3\n\t" - "vst %%v25,80(%%r1,%[y]),3\n\t" - "vst %%v26,96(%%r1,%[y]),3\n\t" - "vst %%v27,112(%%r1,%[y]),3\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,128(%%r1,%[y]),3\n\t" - "vl %%v21,144(%%r1,%[y]),3\n\t" - "vl %%v22,160(%%r1,%[y]),3\n\t" - "vl %%v23,176(%%r1,%[y]),3\n\t" - "vl 
%%v24,192(%%r1,%[x]),3\n\t" - "vl %%v25,208(%%r1,%[x]),3\n\t" - "vl %%v26,224(%%r1,%[x]),3\n\t" - "vl %%v27,240(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[y]),3\n\t" - "vl %%v29,208(%%r1,%[y]),3\n\t" - "vl %%v30,224(%%r1,%[y]),3\n\t" - "vl %%v31,240(%%r1,%[y]),3\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -90,14 +90,14 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y]),3\n\t" - "vst %%v17,144(%%r1,%[y]),3\n\t" - "vst %%v18,160(%%r1,%[y]),3\n\t" - "vst %%v19,176(%%r1,%[y]),3\n\t" - "vst %%v24,192(%%r1,%[y]),3\n\t" - "vst %%v25,208(%%r1,%[y]),3\n\t" - "vst %%v26,224(%%r1,%[y]),3\n\t" - "vst %%v27,240(%%r1,%[y]),3\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index 0dd8ed08a1..f5f601717c 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -43,22 +43,22 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[x])\n\t" "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[y]),3\n\t" - "vl %%v25,16(%%r1,%[y]),3\n\t" - "vl %%v26,32(%%r1,%[y]),3\n\t" - "vl %%v27,48(%%r1,%[y]),3\n\t" - "vl %%v28,64(%%r1,%[y]),3\n\t" - "vl %%v29,80(%%r1,%[y]),3\n\t" - "vl %%v30,96(%%r1,%[y]),3\n\t" - "vl %%v31,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index 87ed6ecd1f..c93ff9b548 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ 
-52,26 +52,26 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,0(%%r1,%[ap2]),3\n\t" - "vl %%v19,0(%%r1,%[ap3]),3\n\t" - "vl %%v20,16(%%r1,%[ap0]),3\n\t" - "vl %%v21,16(%%r1,%[ap1]),3\n\t" - "vl %%v22,16(%%r1,%[ap2]),3\n\t" - "vl %%v23,16(%%r1,%[ap3]),3\n\t" - "vl %%v24,32(%%r1,%[ap0]),3\n\t" - "vl %%v25,32(%%r1,%[ap1]),3\n\t" - "vl %%v26,32(%%r1,%[ap2]),3\n\t" - "vl %%v27,32(%%r1,%[ap3]),3\n\t" - "vl %%v28,48(%%r1,%[ap0]),3\n\t" - "vl %%v29,48(%%r1,%[ap1]),3\n\t" - "vl %%v30,48(%%r1,%[ap2]),3\n\t" - "vl %%v31,48(%%r1,%[ap3]),3\n\t" - "vl %%v4,0(%%r1,%[y]),3\n\t" - "vl %%v5,16(%%r1,%[y]),3\n\t" - "vl %%v6,32(%%r1,%[y]),3\n\t" - "vl %%v7,48(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -88,30 +88,30 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y]),3\n\t" - "vst %%v5,16(%%r1,%[y]),3\n\t" - "vst %%v6,32(%%r1,%[y]),3\n\t" - "vst %%v7,48(%%r1,%[y]),3\n\t" - "vl %%v16,64(%%r1,%[ap0]),3\n\t" - "vl %%v17,64(%%r1,%[ap1]),3\n\t" - "vl %%v18,64(%%r1,%[ap2]),3\n\t" - "vl %%v19,64(%%r1,%[ap3]),3\n\t" - "vl %%v20,80(%%r1,%[ap0]),3\n\t" - "vl %%v21,80(%%r1,%[ap1]),3\n\t" - "vl %%v22,80(%%r1,%[ap2]),3\n\t" - "vl %%v23,80(%%r1,%[ap3]),3\n\t" - "vl %%v24,96(%%r1,%[ap0]),3\n\t" - "vl %%v25,96(%%r1,%[ap1]),3\n\t" - "vl %%v26,96(%%r1,%[ap2]),3\n\t" - "vl %%v27,96(%%r1,%[ap3]),3\n\t" - "vl %%v28,112(%%r1,%[ap0]),3\n\t" - "vl %%v29,112(%%r1,%[ap1]),3\n\t" - "vl %%v30,112(%%r1,%[ap2]),3\n\t" - "vl %%v31,112(%%r1,%[ap3]),3\n\t" - "vl %%v4,64(%%r1,%[y]),3\n\t" - "vl %%v5,80(%%r1,%[y]),3\n\t" - "vl %%v6,96(%%r1,%[y]),3\n\t" - "vl %%v7,112(%%r1,%[y]),3\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -128,10 +128,10 @@ static void 
dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y]),3\n\t" - "vst %%v5,80(%%r1,%[y]),3\n\t" - "vst %%v6,96(%%r1,%[y]),3\n\t" - "vst %%v7,112(%%r1,%[y]),3\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -141,16 +141,16 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,0(%%r1,%[ap2]),3\n\t" - "vl %%v19,0(%%r1,%[ap3]),3\n\t" - "vl %%v20,16(%%r1,%[ap0]),3\n\t" - "vl %%v21,16(%%r1,%[ap1]),3\n\t" - "vl %%v22,16(%%r1,%[ap2]),3\n\t" - "vl %%v23,16(%%r1,%[ap3]),3\n\t" - "vl %%v4,0(%%r1,%[y]),3\n\t" - "vl %%v5,16(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" @@ -159,8 +159,8 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y]),3\n\t" - "vst %%v5,16(%%r1,%[y]),3\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -193,30 +193,30 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,16(%%r1,%[ap0]),3\n\t" - "vl %%v19,16(%%r1,%[ap1]),3\n\t" - "vl %%v20,32(%%r1,%[ap0]),3\n\t" - "vl %%v21,32(%%r1,%[ap1]),3\n\t" - "vl %%v22,48(%%r1,%[ap0]),3\n\t" - "vl %%v23,48(%%r1,%[ap1]),3\n\t" - "vl %%v24,64(%%r1,%[ap0]),3\n\t" - "vl %%v25,64(%%r1,%[ap1]),3\n\t" - "vl %%v26,80(%%r1,%[ap0]),3\n\t" - "vl %%v27,80(%%r1,%[ap1]),3\n\t" - "vl %%v28,96(%%r1,%[ap0]),3\n\t" - "vl %%v29,96(%%r1,%[ap1]),3\n\t" - "vl %%v30,112(%%r1,%[ap0]),3\n\t" - "vl %%v31,112(%%r1,%[ap1]),3\n\t" - "vl %%v2,0(%%r1,%[y]),3\n\t" - "vl %%v3,16(%%r1,%[y]),3\n\t" - "vl %%v4,32(%%r1,%[y]),3\n\t" - "vl %%v5,48(%%r1,%[y]),3\n\t" - "vl %%v6,64(%%r1,%[y]),3\n\t" - "vl %%v7,80(%%r1,%[y]),3\n\t" - "vl %%v8,96(%%r1,%[y]),3\n\t" - "vl %%v9,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb 
%%v3,%%v18,%%v0,%%v3\n\t" "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" @@ -233,14 +233,14 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y]),3\n\t" - "vst %%v3,16(%%r1,%[y]),3\n\t" - "vst %%v4,32(%%r1,%[y]),3\n\t" - "vst %%v5,48(%%r1,%[y]),3\n\t" - "vst %%v6,64(%%r1,%[y]),3\n\t" - "vst %%v7,80(%%r1,%[y]),3\n\t" - "vst %%v8,96(%%r1,%[y]),3\n\t" - "vst %%v9,112(%%r1,%[y]),3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -250,18 +250,18 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,16(%%r1,%[ap0]),3\n\t" - "vl %%v19,16(%%r1,%[ap1]),3\n\t" - "vl %%v2,0(%%r1,%[y]),3\n\t" - "vl %%v3,16(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y]),3\n\t" - "vst %%v3,16(%%r1,%[y]),3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -289,22 +289,22 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0]),3\n\t" - "vl %%v17,16(%%r1,%[a0]),3\n\t" - "vl %%v18,32(%%r1,%[a0]),3\n\t" - "vl %%v19,48(%%r1,%[a0]),3\n\t" - "vl %%v20,64(%%r1,%[a0]),3\n\t" - "vl %%v21,80(%%r1,%[a0]),3\n\t" - "vl %%v22,96(%%r1,%[a0]),3\n\t" - "vl %%v23,112(%%r1,%[a0]),3\n\t" - "vl %%v24,0(%%r1,%[y]),3\n\t" - "vl %%v25,16(%%r1,%[y]),3\n\t" - "vl %%v26,32(%%r1,%[y]),3\n\t" - "vl %%v27,48(%%r1,%[y]),3\n\t" - "vl %%v28,64(%%r1,%[y]),3\n\t" - "vl %%v29,80(%%r1,%[y]),3\n\t" - "vl %%v30,96(%%r1,%[y]),3\n\t" - "vl %%v31,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" @@ -313,14 +313,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y]),3\n\t" - "vst %%v25,16(%%r1,%[y]),3\n\t" - "vst %%v26,32(%%r1,%[y]),3\n\t" - "vst %%v27,48(%%r1,%[y]),3\n\t" - "vst %%v28,64(%%r1,%[y]),3\n\t" - "vst %%v29,80(%%r1,%[y]),3\n\t" - "vst %%v30,96(%%r1,%[y]),3\n\t" - "vst %%v31,112(%%r1,%[y]),3\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst 
%%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -330,14 +330,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[a0]),3\n\t" - "vl %%v17,16(%%r1,%[a0]),3\n\t" - "vl %%v18,0(%%r1,%[y]),3\n\t" - "vl %%v19,16(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y]),3\n\t" - "vst %%v19,16(%%r1,%[y]),3\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 9fd3c09d69..24680cf1b7 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -50,77 +50,77 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2]),3\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3]),3\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0]),3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1]),3\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2]),3\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3]),3\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0]),3\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1]),3\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2]),3\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3]),3\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0]),3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1]),3\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2]),3\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3]),3\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0]),3\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1]),3\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - 
"vl %%v26,64(%%r1,%[ap2]),3\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3]),3\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0]),3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1]),3\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2]),3\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3]),3\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0]),3\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1]),3\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2]),3\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3]),3\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0]),3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1]),3\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2]),3\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3]),3\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -131,23 +131,23 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2]),3\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3]),3\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0]),3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1]),3\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2]),3\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3]),3\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -198,45 +198,45 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0]),3\n\t" + "vl 
%%v26,16(%%r1,%[ap0])\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1]),3\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0]),3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1]),3\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0]),3\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1]),3\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0]),3\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1]),3\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0]),3\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1]),3\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0]),3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1]),3\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0]),3\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1]),3\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -247,15 +247,15 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0]),3\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1]),3\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -299,29 +299,29 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[a0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0]),3\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0]),3\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0]),3\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0]),3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0]),3\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0]),3\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vl 
%%v31,112(%%r1,%[a0]),3\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -332,11 +332,11 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[a0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0]),3\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -378,38 +378,38 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src]),3\n\t" - "vl %%v17,16(%%r1,%[src]),3\n\t" - "vl %%v18,32(%%r1,%[src]),3\n\t" - "vl %%v19,48(%%r1,%[src]),3\n\t" - "vl %%v20,64(%%r1,%[src]),3\n\t" - "vl %%v21,80(%%r1,%[src]),3\n\t" - "vl %%v22,96(%%r1,%[src]),3\n\t" - "vl %%v23,112(%%r1,%[src]),3\n\t" - "vl %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest]),3\n\t" - "vl %%v25, 16(%%r1,%[dest]),3\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest]),3\n\t" - "vl %%v26, 32(%%r1,%[dest]),3\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest]),3\n\t" - "vl %%v27, 48(%%r1,%[dest]),3\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest]),3\n\t" - "vl %%v28, 64(%%r1,%[dest]),3\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest]),3\n\t" - "vl %%v29, 80(%%r1,%[dest]),3\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest]),3\n\t" - "vl %%v30, 96(%%r1,%[dest]),3\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest]),3\n\t" - "vl %%v31, 112(%%r1,%[dest]),3\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest]),3\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -419,14 +419,14 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[src]),3\n\t" - "vl %%v17,16(%%r1,%[src]),3\n\t" - "vl %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest]),3\n\t" - "vl %%v25, 16(%%r1,%[dest]),3\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest]),3\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git 
a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index cc0f23c877..65ed31f01b 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmaxdb %%v16,%%v16,%%v24,0\n\t" "vfmaxdb %%v17,%%v17,%%v25,0\n\t" "vfmaxdb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 83d827d35f..87bccbe55d 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" @@ -59,14 +59,14 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 754828b7c9..518cc262ce 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmindb %%v16,%%v16,%%v24,0\n\t" "vfmindb %%v17,%%v17,%%v25,0\n\t" "vfmindb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index ff0fca48c2..91561992f5 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" @@ -59,14 +59,14 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index de2207fcd1..8f0197f023 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -35,14 +35,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x]),3\n\t" - "vl %%v25, 16(%%r1,%[x]),3\n\t" - "vl %%v26, 32(%%r1,%[x]),3\n\t" - "vl %%v27, 48(%%r1,%[x]),3\n\t" - "vl %%v16, 0(%%r1,%[y]),3\n\t" - "vl %%v17, 16(%%r1,%[y]),3\n\t" - "vl %%v18, 32(%%r1,%[y]),3\n\t" - "vl %%v19, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x]),3\n\t" - "vst %%v29, 16(%%r1,%[x]),3\n\t" - "vst %%v30, 32(%%r1,%[x]),3\n\t" - "vst %%v31, 48(%%r1,%[x]),3\n\t" - "vst %%v20, 0(%%r1,%[y]),3\n\t" - "vst %%v21, 16(%%r1,%[y]),3\n\t" - "vst %%v22, 32(%%r1,%[y]),3\n\t" - "vst %%v23, 48(%%r1,%[y]),3\n\t" - "vl %%v24, 64(%%r1,%[x]),3\n\t" - "vl %%v25, 80(%%r1,%[x]),3\n\t" - "vl %%v26, 96(%%r1,%[x]),3\n\t" - "vl %%v27, 112(%%r1,%[x]),3\n\t" - "vl %%v16, 64(%%r1,%[y]),3\n\t" - "vl %%v17, 80(%%r1,%[y]),3\n\t" - "vl %%v18, 96(%%r1,%[y]),3\n\t" - "vl %%v19, 112(%%r1,%[y]),3\n\t" + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 
32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x]),3\n\t" - "vst %%v29, 80(%%r1,%[x]),3\n\t" - "vst %%v30, 96(%%r1,%[x]),3\n\t" - "vst %%v31, 112(%%r1,%[x]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v16, 128(%%r1,%[y]),3\n\t" - "vl %%v17, 144(%%r1,%[y]),3\n\t" - "vl %%v18, 160(%%r1,%[y]),3\n\t" - "vl %%v19, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x]),3\n\t" - "vst %%v29, 144(%%r1,%[x]),3\n\t" - "vst %%v30, 160(%%r1,%[x]),3\n\t" - "vst %%v31, 176(%%r1,%[x]),3\n\t" - "vst %%v20, 128(%%r1,%[y]),3\n\t" - "vst %%v21, 144(%%r1,%[y]),3\n\t" - "vst %%v22, 160(%%r1,%[y]),3\n\t" - "vst %%v23, 176(%%r1,%[y]),3\n\t" - "vl %%v24, 192(%%r1,%[x]),3\n\t" - "vl %%v25, 208(%%r1,%[x]),3\n\t" - "vl %%v26, 224(%%r1,%[x]),3\n\t" - "vl %%v27, 240(%%r1,%[x]),3\n\t" - "vl %%v16, 192(%%r1,%[y]),3\n\t" - "vl %%v17, 208(%%r1,%[y]),3\n\t" - "vl %%v18, 224(%%r1,%[y]),3\n\t" - "vl %%v19, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 
192(%%r1,%[x]),3\n\t" - "vst %%v29, 208(%%r1,%[x]),3\n\t" - "vst %%v30, 224(%%r1,%[x]),3\n\t" - "vst %%v31, 240(%%r1,%[x]),3\n\t" - "vst %%v20, 192(%%r1,%[y]),3\n\t" - "vst %%v21, 208(%%r1,%[y]),3\n\t" - "vst %%v22, 224(%%r1,%[y]),3\n\t" - "vst %%v23, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index bc58569d59..c944990b5a 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -33,30 +33,30 @@ static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" "vfmdb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x]),3\n\t" - "vl %%v25,16(%%r1,%[x]),3\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" "vfmdb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x]),3\n\t" - "vl %%v26,32(%%r1,%[x]),3\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" "vfmdb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x]),3\n\t" - "vl %%v27,48(%%r1,%[x]),3\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" "vfmdb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x]),3\n\t" - "vl %%v28,64(%%r1,%[x]),3\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" "vfmdb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x]),3\n\t" - "vl %%v29,80(%%r1,%[x]),3\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" "vfmdb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x]),3\n\t" - "vl %%v30,96(%%r1,%[x]),3\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" "vfmdb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x]),3\n\t" - "vl %%v31,112(%%r1,%[x]),3\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" "vfmdb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x]),3\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) @@ -71,14 +71,14 @@ static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x]),3\n\t" - "vst %%v0,16(%%r1,%[x]),3\n\t" - "vst %%v0,32(%%r1,%[x]),3\n\t" - "vst %%v0,48(%%r1,%[x]),3\n\t" - "vst %%v0,64(%%r1,%[x]),3\n\t" - "vst %%v0,80(%%r1,%[x]),3\n\t" - "vst %%v0,96(%%r1,%[x]),3\n\t" - "vst %%v0,112(%%r1,%[x]),3\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index f4da46dc14..60ba40bd62 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -33,70 +33,70 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 
112(%%r1,%[x]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v28, 192(%%r1,%[x]),3\n\t" - "vl %%v29, 208(%%r1,%[x]),3\n\t" - "vl %%v30, 224(%%r1,%[x]),3\n\t" - "vl %%v31, 240(%%r1,%[x]),3\n\t" - "vl %%v0, 0(%%r1,%[y]),3\n\t" - "vl %%v1, 16(%%r1,%[y]),3\n\t" - "vl %%v2, 32(%%r1,%[y]),3\n\t" - "vl %%v3, 48(%%r1,%[y]),3\n\t" - "vl %%v4, 64(%%r1,%[y]),3\n\t" - "vl %%v5, 80(%%r1,%[y]),3\n\t" - "vl %%v6, 96(%%r1,%[y]),3\n\t" - "vl %%v7, 112(%%r1,%[y]),3\n\t" - "vst %%v0, 0(%%r1,%[x]),3\n\t" - "vst %%v1, 16(%%r1,%[x]),3\n\t" - "vst %%v2, 32(%%r1,%[x]),3\n\t" - "vst %%v3, 48(%%r1,%[x]),3\n\t" - "vst %%v4, 64(%%r1,%[x]),3\n\t" - "vst %%v5, 80(%%r1,%[x]),3\n\t" - "vst %%v6, 96(%%r1,%[x]),3\n\t" - "vst %%v7, 112(%%r1,%[x]),3\n\t" - "vl %%v0, 128(%%r1,%[y]),3\n\t" - "vl %%v1, 144(%%r1,%[y]),3\n\t" - "vl %%v2, 160(%%r1,%[y]),3\n\t" - "vl %%v3, 176(%%r1,%[y]),3\n\t" - "vl %%v4, 192(%%r1,%[y]),3\n\t" - "vl %%v5, 208(%%r1,%[y]),3\n\t" - "vl %%v6, 224(%%r1,%[y]),3\n\t" - "vl %%v7, 240(%%r1,%[y]),3\n\t" - "vst %%v0, 128(%%r1,%[x]),3\n\t" - "vst %%v1, 144(%%r1,%[x]),3\n\t" - "vst %%v2, 160(%%r1,%[x]),3\n\t" - "vst %%v3, 176(%%r1,%[x]),3\n\t" - "vst %%v4, 192(%%r1,%[x]),3\n\t" - "vst %%v5, 208(%%r1,%[x]),3\n\t" - "vst %%v6, 224(%%r1,%[x]),3\n\t" - "vst %%v7, 240(%%r1,%[x]),3\n\t" - "vst %%v16, 0(%%r1,%[y]),3\n\t" - "vst %%v17, 16(%%r1,%[y]),3\n\t" - "vst %%v18, 32(%%r1,%[y]),3\n\t" - "vst %%v19, 48(%%r1,%[y]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vst %%v24, 128(%%r1,%[y]),3\n\t" - "vst %%v25, 144(%%r1,%[y]),3\n\t" - "vst %%v26, 160(%%r1,%[y]),3\n\t" - "vst %%v27, 176(%%r1,%[y]),3\n\t" - "vst %%v28, 192(%%r1,%[y]),3\n\t" - "vst %%v29, 208(%%r1,%[y]),3\n\t" - "vst %%v30, 224(%%r1,%[y]),3\n\t" - "vst %%v31, 240(%%r1,%[y]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 
16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index bd0f181152..8434c811f4 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 4884d1e3a7..80a37e6c25 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index a6b95bf3e3..18cdba4376 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index c3f36d964f..02ca427e47 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 83e5e93c99..43ae8ff8b7 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -45,14 +45,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x]),3\n\t" - "vl %%v17, 144(%%r1,%[x]),3\n\t" - "vl %%v18, 160(%%r1,%[x]),3\n\t" - "vl %%v19, 176(%%r1,%[x]),3\n\t" - "vl %%v20, 192(%%r1,%[x]),3\n\t" - "vl %%v21, 208(%%r1,%[x]),3\n\t" - "vl %%v22, 224(%%r1,%[x]),3\n\t" - "vl %%v23, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 77bb09a2e2..31549849d8 100644 --- a/kernel/zarch/zaxpy.c +++ 
b/kernel/zarch/zaxpy.c @@ -45,22 +45,22 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x]),3\n\t" - "vl %%v9,16(%%r1,%[x]),3\n\t" - "vl %%v10,32(%%r1,%[x]),3\n\t" - "vl %%v11,48(%%r1,%[x]),3\n\t" - "vl %%v12,0(%%r1,%[y]),3\n\t" - "vl %%v13,16(%%r1,%[y]),3\n\t" - "vl %%v14,32(%%r1,%[y]),3\n\t" - "vl %%v15,48(%%r1,%[y]),3\n\t" - "vl %%v16,64(%%r1,%[x]),3\n\t" - "vl %%v17,80(%%r1,%[x]),3\n\t" - "vl %%v18,96(%%r1,%[x]),3\n\t" - "vl %%v19,112(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[y]),3\n\t" - "vl %%v21,80(%%r1,%[y]),3\n\t" - "vl %%v22,96(%%r1,%[y]),3\n\t" - "vl %%v23,112(%%r1,%[y]),3\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" "vpdi %%v24,%%v8,%%v8,4\n\t" "vpdi %%v25,%%v9,%%v9,4\n\t" "vpdi %%v26,%%v10,%%v10,4\n\t" @@ -85,14 +85,14 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y]),3\n\t" - "vst %%v9,16(%%r1,%[y]),3\n\t" - "vst %%v10,32(%%r1,%[y]),3\n\t" - "vst %%v11,48(%%r1,%[y]),3\n\t" - "vst %%v16,64(%%r1,%[y]),3\n\t" - "vst %%v17,80(%%r1,%[y]),3\n\t" - "vst %%v18,96(%%r1,%[y]),3\n\t" - "vst %%v19,112(%%r1,%[y]),3\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 8cfbaadb83..7a67ef734b 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -41,14 +41,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v0, 0(%%r1,%[y]),3\n\t" - "vl %%v1, 16(%%r1,%[y]),3\n\t" - "vl %%v2, 32(%%r1,%[y]),3\n\t" - "vl %%v3, 48(%%r1,%[y]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" @@ -61,14 +61,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x]),3\n\t" - "vl %%v17, 80(%%r1,%[x]),3\n\t" - "vl %%v18, 96(%%r1,%[x]),3\n\t" - "vl %%v19, 112(%%r1,%[x]),3\n\t" - "vl %%v0, 64(%%r1,%[y]),3\n\t" - "vl %%v1, 80(%%r1,%[y]),3\n\t" - "vl %%v2, 96(%%r1,%[y]),3\n\t" - "vl %%v3, 112(%%r1,%[y]),3\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 
96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 4b64fc8a56..7f21985ecf 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x]),3\n\t" - "vl %%v17,16(%[x]),3\n\t" - "vl %%v18,32(%[x]),3\n\t" - "vl %%v19,48(%[x]),3\n\t" + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v20,8(%[x]),0\n\t" "wflcdb %%v20,%%v20\n\t" @@ -69,8 +69,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y]),3\n\t" - "vl %%v1,16(%%r1,%[y]),3\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" "vlrepg %%v24,0(%%r1,%[ap0])\n\t" "vlrepg %%v25,8(%%r1,%[ap0])\n\t" "vlrepg %%v26,0(%%r1,%[ap1])\n\t" @@ -103,8 +103,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y]),3\n\t" - "vst %%v1,16(%%r1,%[y]),3\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -119,8 +119,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x]),3\n\t" - "vl %%v17,16(%[x]),3\n\t" + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v18,8(%[x]),0\n\t" "wflcdb %%v18,%%v18\n\t" @@ -142,8 +142,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y]),3\n\t" - "vl %%v1,16(%%r1,%[y]),3\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" "vlrepg %%v20,0(%%r1,%[ap0])\n\t" "vlrepg %%v21,8(%%r1,%[ap0])\n\t" "vlrepg %%v22,0(%%r1,%[ap1])\n\t" @@ -160,8 +160,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y]),3\n\t" - "vst %%v1,16(%%r1,%[y]),3\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -173,7 +173,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x]),3\n\t" + __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v17,8(%[x]),0\n\t" "wflcdb %%v17,%%v17\n\t" @@ -188,8 +188,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 
2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y]),3\n\t" - "vl %%v1,16(%%r1,%[y]),3\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" "vlrepg %%v18,0(%%r1,%[ap])\n\t" "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vlrepg %%v20,16(%%r1,%[ap])\n\t" @@ -198,8 +198,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y]),3\n\t" - "vst %%v1,16(%%r1,%[y]),3\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -227,14 +227,14 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src]),3\n\t" - "vl %%v17,16(%%r1,%[src]),3\n\t" - "vl %%v18,32(%%r1,%[src]),3\n\t" - "vl %%v19,48(%%r1,%[src]),3\n\t" - "vl %%v20,0(%%r1,%[dest]),3\n\t" - "vl %%v21,16(%%r1,%[dest]),3\n\t" - "vl %%v22,32(%%r1,%[dest]),3\n\t" - "vl %%v23,48(%%r1,%[dest]),3\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl %%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -247,10 +247,10 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest]),3\n\t" - "vst %%v29,16(%%r1,%[dest]),3\n\t" - "vst %%v30,32(%%r1,%[dest]),3\n\t" - "vst %%v31,48(%%r1,%[dest]),3\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" "agfi %%r1,64\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 429824dcf8..7b3e6c1fc6 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x]),3\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -73,7 +73,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x]),3\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -120,10 +120,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v24,0(%[alpha]),0\n\t" "vlrepg %%v25,8(%[alpha])\n\t" #endif - "vl %%v26,0(%[y]),3\n\t" - "vl %%v27,16(%[y]),3\n\t" - "vl %%v28,32(%[y]),3\n\t" - "vl %%v29,48(%[y]),3\n\t" + "vl %%v26,0(%[y])\n\t" + "vl %%v27,16(%[y])\n\t" + "vl %%v28,32(%[y])\n\t" + "vl %%v29,48(%[y])\n\t" "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" @@ -132,10 +132,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb 
%%v28,%%v22,%%v25,%%v28\n\t" "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y]),3\n\t" - "vst %%v27,16(%[y]),3\n\t" - "vst %%v28,32(%[y]),3\n\t" - "vst %%v29,48(%[y]),3" + "vst %%v26,0(%[y])\n\t" + "vst %%v27,16(%[y])\n\t" + "vst %%v28,32(%[y])\n\t" + "vst %%v29,48(%[y])" : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -160,7 +160,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x]),3\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -178,7 +178,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x]),3\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -213,14 +213,14 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v20,0(%[alpha]),0\n\t" "vlrepg %%v21,8(%[alpha])\n\t" #endif - "vl %%v22,0(%[y]),3\n\t" - "vl %%v23,16(%[y]),3\n\t" + "vl %%v22,0(%[y])\n\t" + "vl %%v23,16(%[y])\n\t" "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y]),3\n\t" - "vst %%v23,16(%[y]),3\n\t" + "vst %%v22,0(%[y])\n\t" + "vst %%v23,16(%[y])\n\t" : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -239,7 +239,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x]),3\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -253,7 +253,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x]),3\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -282,10 +282,10 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vleg %%v18,0(%[alpha]),0\n\t" "vlrepg %%v19,8(%[alpha])\n\t" #endif - "vl %%v0,0(%[y]),3\n\t" + "vl %%v0,0(%[y])\n\t" "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y]),3\n\t" + "vst %%v0,0(%[y])\n\t" : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index ea81e4741c..aa7f166052 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -35,14 +35,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x]),3\n\t" - "vl %%v25, 16(%%r1,%[x]),3\n\t" - 
"vl %%v26, 32(%%r1,%[x]),3\n\t" - "vl %%v27, 48(%%r1,%[x]),3\n\t" - "vl %%v16, 0(%%r1,%[y]),3\n\t" - "vl %%v17, 16(%%r1,%[y]),3\n\t" - "vl %%v18, 32(%%r1,%[y]),3\n\t" - "vl %%v19, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x]),3\n\t" - "vst %%v29, 16(%%r1,%[x]),3\n\t" - "vst %%v30, 32(%%r1,%[x]),3\n\t" - "vst %%v31, 48(%%r1,%[x]),3\n\t" - "vst %%v20, 0(%%r1,%[y]),3\n\t" - "vst %%v21, 16(%%r1,%[y]),3\n\t" - "vst %%v22, 32(%%r1,%[y]),3\n\t" - "vst %%v23, 48(%%r1,%[y]),3\n\t" - "vl %%v24, 64(%%r1,%[x]),3\n\t" - "vl %%v25, 80(%%r1,%[x]),3\n\t" - "vl %%v26, 96(%%r1,%[x]),3\n\t" - "vl %%v27, 112(%%r1,%[x]),3\n\t" - "vl %%v16, 64(%%r1,%[y]),3\n\t" - "vl %%v17, 80(%%r1,%[y]),3\n\t" - "vl %%v18, 96(%%r1,%[y]),3\n\t" - "vl %%v19, 112(%%r1,%[y]),3\n\t" + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x]),3\n\t" - "vst %%v29, 80(%%r1,%[x]),3\n\t" - "vst %%v30, 96(%%r1,%[x]),3\n\t" - "vst %%v31, 112(%%r1,%[x]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v16, 128(%%r1,%[y]),3\n\t" - "vl %%v17, 144(%%r1,%[y]),3\n\t" - "vl %%v18, 160(%%r1,%[y]),3\n\t" - "vl %%v19, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn 
*/ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x]),3\n\t" - "vst %%v29, 144(%%r1,%[x]),3\n\t" - "vst %%v30, 160(%%r1,%[x]),3\n\t" - "vst %%v31, 176(%%r1,%[x]),3\n\t" - "vst %%v20, 128(%%r1,%[y]),3\n\t" - "vst %%v21, 144(%%r1,%[y]),3\n\t" - "vst %%v22, 160(%%r1,%[y]),3\n\t" - "vst %%v23, 176(%%r1,%[y]),3\n\t" - "vl %%v24, 192(%%r1,%[x]),3\n\t" - "vl %%v25, 208(%%r1,%[x]),3\n\t" - "vl %%v26, 224(%%r1,%[x]),3\n\t" - "vl %%v27, 240(%%r1,%[x]),3\n\t" - "vl %%v16, 192(%%r1,%[y]),3\n\t" - "vl %%v17, 208(%%r1,%[y]),3\n\t" - "vl %%v18, 224(%%r1,%[y]),3\n\t" - "vl %%v19, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x]),3\n\t" - "vst %%v29, 208(%%r1,%[x]),3\n\t" - "vst %%v30, 224(%%r1,%[x]),3\n\t" - "vst %%v31, 240(%%r1,%[x]),3\n\t" - "vst %%v20, 192(%%r1,%[y]),3\n\t" - "vst %%v21, 208(%%r1,%[y]),3\n\t" - "vst %%v22, 224(%%r1,%[y]),3\n\t" - "vst %%v23, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 7fd62a1ac5..fbcc0c5b9a 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -36,14 +36,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -68,14 +68,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x]),3\n\t" - "vst %%v17,16(%%r1,%[x]),3\n\t" - "vst %%v18,32(%%r1,%[x]),3\n\t" - "vst %%v19,48(%%r1,%[x]),3\n\t" - "vst %%v20,64(%%r1,%[x]),3\n\t" - "vst %%v21,80(%%r1,%[x]),3\n\t" - "vst %%v22,96(%%r1,%[x]),3\n\t" - "vst 
%%v23,112(%%r1,%[x]),3\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -93,14 +93,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vpdi %%v16,%%v16,%%v16,4\n\t" "vpdi %%v17,%%v17,%%v17,4\n\t" "vpdi %%v18,%%v18,%%v18,4\n\t" @@ -117,14 +117,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x]),3\n\t" - "vst %%v17,16(%%r1,%[x]),3\n\t" - "vst %%v18,32(%%r1,%[x]),3\n\t" - "vst %%v19,48(%%r1,%[x]),3\n\t" - "vst %%v20,64(%%r1,%[x]),3\n\t" - "vst %%v21,80(%%r1,%[x]),3\n\t" - "vst %%v22,96(%%r1,%[x]),3\n\t" - "vst %%v23,112(%%r1,%[x]),3\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -139,14 +139,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfmdb %%v16,%%v16,%%v0\n\t" "vfmdb %%v17,%%v17,%%v0\n\t" "vfmdb %%v18,%%v18,%%v0\n\t" @@ -155,14 +155,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x]),3\n\t" - "vst %%v17,16(%%r1,%[x]),3\n\t" - "vst %%v18,32(%%r1,%[x]),3\n\t" - "vst %%v19,48(%%r1,%[x]),3\n\t" - "vst %%v20,64(%%r1,%[x]),3\n\t" - "vst %%v21,80(%%r1,%[x]),3\n\t" - "vst %%v22,96(%%r1,%[x]),3\n\t" - "vst %%v23,112(%%r1,%[x]),3\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -177,14 +177,14 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" 
"pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x]),3\n\t" - "vst %%v0,16(%%r1,%[x]),3\n\t" - "vst %%v0,32(%%r1,%[x]),3\n\t" - "vst %%v0,48(%%r1,%[x]),3\n\t" - "vst %%v0,64(%%r1,%[x]),3\n\t" - "vst %%v0,80(%%r1,%[x]),3\n\t" - "vst %%v0,96(%%r1,%[x]),3\n\t" - "vst %%v0,112(%%r1,%[x]),3\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0252ab8db9..0f38103be7 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -33,70 +33,70 @@ static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 112(%%r1,%[x]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v28, 192(%%r1,%[x]),3\n\t" - "vl %%v29, 208(%%r1,%[x]),3\n\t" - "vl %%v30, 224(%%r1,%[x]),3\n\t" - "vl %%v31, 240(%%r1,%[x]),3\n\t" - "vl %%v0, 0(%%r1,%[y]),3\n\t" - "vl %%v1, 16(%%r1,%[y]),3\n\t" - "vl %%v2, 32(%%r1,%[y]),3\n\t" - "vl %%v3, 48(%%r1,%[y]),3\n\t" - "vl %%v4, 64(%%r1,%[y]),3\n\t" - "vl %%v5, 80(%%r1,%[y]),3\n\t" - "vl %%v6, 96(%%r1,%[y]),3\n\t" - "vl %%v7, 112(%%r1,%[y]),3\n\t" - "vst %%v0, 0(%%r1,%[x]),3\n\t" - "vst %%v1, 16(%%r1,%[x]),3\n\t" - "vst %%v2, 32(%%r1,%[x]),3\n\t" - "vst %%v3, 48(%%r1,%[x]),3\n\t" - "vst %%v4, 64(%%r1,%[x]),3\n\t" - "vst %%v5, 80(%%r1,%[x]),3\n\t" - "vst %%v6, 96(%%r1,%[x]),3\n\t" - "vst %%v7, 112(%%r1,%[x]),3\n\t" - "vl %%v0, 128(%%r1,%[y]),3\n\t" - "vl %%v1, 144(%%r1,%[y]),3\n\t" - "vl %%v2, 160(%%r1,%[y]),3\n\t" - "vl %%v3, 176(%%r1,%[y]),3\n\t" - "vl %%v4, 192(%%r1,%[y]),3\n\t" - "vl %%v5, 208(%%r1,%[y]),3\n\t" - "vl %%v6, 224(%%r1,%[y]),3\n\t" - "vl %%v7, 240(%%r1,%[y]),3\n\t" - "vst %%v0, 128(%%r1,%[x]),3\n\t" - "vst %%v1, 144(%%r1,%[x]),3\n\t" - "vst %%v2, 160(%%r1,%[x]),3\n\t" - "vst %%v3, 176(%%r1,%[x]),3\n\t" - "vst %%v4, 192(%%r1,%[x]),3\n\t" - "vst %%v5, 208(%%r1,%[x]),3\n\t" - "vst %%v6, 224(%%r1,%[x]),3\n\t" - "vst %%v7, 240(%%r1,%[x]),3\n\t" - "vst %%v16, 0(%%r1,%[y]),3\n\t" - "vst %%v17, 16(%%r1,%[y]),3\n\t" - "vst %%v18, 32(%%r1,%[y]),3\n\t" - "vst %%v19, 48(%%r1,%[y]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vst %%v24, 128(%%r1,%[y]),3\n\t" - "vst %%v25, 144(%%r1,%[y]),3\n\t" - "vst %%v26, 160(%%r1,%[y]),3\n\t" - "vst %%v27, 176(%%r1,%[y]),3\n\t" - "vst %%v28, 192(%%r1,%[y]),3\n\t" - "vst %%v29, 208(%%r1,%[y]),3\n\t" - "vst %%v30, 224(%%r1,%[y]),3\n\t" - "vst %%v31, 240(%%r1,%[y]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl 
%%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) From 69edc5bbe79af88710666aa909e7b39c89558b9c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 7 Feb 2019 20:06:13 +0100 Subject: [PATCH 095/189] Restore dropped patches in the non-TLS branch of memory.c (#2004) * Restore dropped patches in the non-TLS branch of memory.c As discovered in #2002, the reintroduction of the "original" non-TLS version of memory.c as an alternate branch had inadvertently used ba1f91f rather than a8002e2 , thereby dropping the commits for #1450, #1468, #1501, #1504 and #1520. 
--- driver/others/memory.c | 77 ++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 72d3e173cf..2e185593e8 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1603,9 +1603,11 @@ void gotoblas_dummy_for_PGI(void) { #endif #else +/* USE_TLS / COMPILE_TLS not set */ + #include <errno.h> -#ifdef OS_WINDOWS +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) #define ALLOC_WINDOWS #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 @@ -1619,7 +1621,7 @@ void gotoblas_dummy_for_PGI(void) { #include <stdio.h> #include <fcntl.h> -#ifndef OS_WINDOWS +#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) #include <sys/mman.h> #ifndef NO_SYSV_IPC #include <sys/shm.h> #endif @@ -1639,7 +1641,7 @@ void gotoblas_dummy_for_PGI(void) { #include <sys/resource.h> #endif -#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include <sys/sysctl.h> #include <sys/resource.h> #endif @@ -1678,9 +1680,12 @@ void gotoblas_dummy_for_PGI(void) { #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else +#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) +#else +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -1740,7 +1745,8 @@ int i,n; size = CPU_ALLOC_SIZE(nums); ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; - nums = CPU_COUNT_S(size,cpusetp); + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -1756,7 +1762,7 @@ int get_num_procs(void) { return nums; } #endif - + #ifdef OS_HAIKU int get_num_procs(void) { static int nums = 0; @@ -1793,7 +1799,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) int get_num_procs(void) { @@ -1870,7 +1876,7 @@ void openblas_fork_handler() // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 // In the mean time build with USE_OPENMP=0 or link against another // implementation of OpenMP.
-#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) +#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); if(err != 0) @@ -1883,7 +1889,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1891,11 +1897,11 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif - blas_goto_num = 0; + // blas_goto_num = 0; #ifndef USE_OPENMP blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -1907,7 +1913,7 @@ int blas_get_cpu_number(void){ #endif - blas_omp_num = 0; + // blas_omp_num = 0; blas_omp_num=openblas_omp_num_threads_env(); if (blas_omp_num < 0) blas_omp_num = 0; @@ -1915,7 +1921,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -2002,11 +2008,15 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif } #ifdef OS_LINUX @@ -2148,14 +2158,18 @@ static void *alloc_mmap(void *address){ #if defined(OS_LINUX) && !defined(NO_WARMUP) } #endif - LOCK_COMMAND(&alloc_lock); if (map_address != (void *)-1) { +#if defined(SMP) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if defined(SMP) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif } - UNLOCK_COMMAND(&alloc_lock); return map_address; } @@ -2554,6 +2568,11 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + +#if defined(USE_OPENMP) + if (!memory_initialized) { +#endif + LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { @@ -2589,6 +2608,9 @@ void *blas_memory_alloc(int procpos){ } UNLOCK_COMMAND(&alloc_lock); +#if defined(USE_OPENMP) + } +#endif #ifdef DEBUG printf("Alloc Start ...\n"); @@ -2603,13 +2625,17 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -// blas_lock(&memory[position].lock); - +#else + 
blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; - +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -// blas_unlock(&memory[position].lock); +#else + blas_unlock(&memory[position].lock); +#endif } position ++; @@ -2647,7 +2673,6 @@ void *blas_memory_alloc(int procpos){ memory[position].used = 1; UNLOCK_COMMAND(&alloc_lock); -/* blas_unlock(&memory[position].lock);*/ if (!memory[position].addr) { do { @@ -2693,9 +2718,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif memory[position].addr = map_address; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); @@ -2749,8 +2778,9 @@ void blas_memory_free(void *free_area){ #endif position = 0; +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); - +#endif while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; @@ -2764,7 +2794,9 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -2779,8 +2811,9 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); - +#endif return; } From 03a2bf2602714360fdf7096a4fc362ecfc700823 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Feb 2019 23:24:45 +0100 Subject: [PATCH 096/189] Fix potential memory leak in cpu enumeration on Linux (#2008) * Fix potential memory leak in cpu enumeration with glibc An early return after a failed call to sched_getaffinity would leak the previously allocated cpu_set_t. Wrong calculation of the size argument in that call increased the likelihood of that failure.
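For reference, a condensed sketch of the corrected glibc (>= 2.7) pattern that the hunks below apply in both copies of get_num_procs(); this is an abbreviation of the patched code, not a verbatim excerpt, and assumes _GNU_SOURCE with <sched.h>:

  if (nums >= CPU_SETSIZE) {                  /* large set: must be heap-allocated */
    cpu_set_t *cpusetp = CPU_ALLOC(nums);
    if (cpusetp == NULL) return nums;
    size_t size = CPU_ALLOC_SIZE(nums);       /* size of the allocated set, not sizeof(cpu_set_t) */
    if (sched_getaffinity(0, size, cpusetp) != 0) {
      CPU_FREE(cpusetp);                      /* the early return used to leak this */
      return nums;
    }
    int ret = CPU_COUNT_S(size, cpusetp);
    if (ret > 0 && ret < nums) nums = ret;
    CPU_FREE(cpusetp);
  } else {                                    /* small set: stack copy, nothing to free */
    cpu_set_t cpuset;
    if (sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0) {
      int ret = CPU_COUNT(&cpuset);
      if (ret > 0 && ret < nums) nums = ret;
    }
  }
  return nums;

Small affinity sets take the stack-allocated branch, where there is nothing to free; only the CPU_ALLOC branch needed the CPU_FREE on its error path.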
Fixes #2003 --- driver/others/memory.c | 123 ++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 38 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 2e185593e8..09851f15c0 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -198,45 +198,68 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i<nums;i++) - if (CPU_ISSET(i,cpusetp)) n++; + if (CPU_ISSET(i,&cpuset)) n++; nums=n; #else - nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); + nums = CPU_COUNT(sizeof(cpuset),&cpuset); #endif return nums; #else - cpusetp = CPU_ALLOC(nums); - if (cpusetp == NULL) return nums; - size = CPU_ALLOC_SIZE(nums); - ret = sched_getaffinity(0,size,cpusetp); - if (ret!=0) return nums; - ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; + if (nums >= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } @@ -1709,46 +1732,70 @@ void goto_set_num_threads(int num_threads) {}; int get_num_procs(void); #else int get_num_procs(void) { + static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i<nums;i++) - if (CPU_ISSET(i,cpusetp)) n++; + if (CPU_ISSET(i,&cpuset)) n++; nums=n; #else - nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); + nums = CPU_COUNT(sizeof(cpuset),&cpuset); #endif return nums; #else - cpusetp = CPU_ALLOC(nums); - if (cpusetp == NULL) return nums; - size = CPU_ALLOC_SIZE(nums); - ret = sched_getaffinity(0,size,cpusetp); - if (ret!=0) return nums; - ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; + if (nums >= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } From 77fe70019f0fb4064eec2a5b26a6057acef29b58 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 11 Feb 2019 16:01:13 +0200 Subject: [PATCH 097/189] [ZARCH] Fix constraints and source code formatting --- kernel/zarch/camax.c | 212 +++++------ kernel/zarch/camin.c | 212 +++++------ kernel/zarch/casum.c | 154 ++++---- kernel/zarch/caxpy.c | 130 +++---- kernel/zarch/ccopy.c | 21 +- kernel/zarch/cdot.c | 148 ++++---
kernel/zarch/cgemv_n_4.c | 590 ++++++++++++++--------------- kernel/zarch/cgemv_t_4.c | 52 +-- kernel/zarch/crot.c | 291 +++++++-------- kernel/zarch/cscal.c | 309 ++++++++-------- kernel/zarch/cswap.c | 151 ++++---- kernel/zarch/damax.c | 90 ++--- kernel/zarch/damax_z13.c | 158 ++++---- kernel/zarch/damin.c | 90 ++--- kernel/zarch/damin_z13.c | 158 ++++---- kernel/zarch/dasum.c | 150 ++++---- kernel/zarch/daxpy.c | 152 ++++---- kernel/zarch/dcopy.c | 20 +- kernel/zarch/ddot.c | 108 +++--- kernel/zarch/dgemv_n_4.c | 624 ++++++++++++++++--------------- kernel/zarch/dgemv_t_4.c | 780 ++++++++++++++++++++------------------- kernel/zarch/dmax.c | 90 ++--- kernel/zarch/dmax_z13.c | 124 +++---- kernel/zarch/dmin.c | 90 ++--- kernel/zarch/dmin_z13.c | 124 +++---- kernel/zarch/drot.c | 291 +++++++-------- kernel/zarch/dscal.c | 102 ++--- kernel/zarch/dsdot.c | 171 ++++----- kernel/zarch/dswap.c | 151 ++++---- kernel/zarch/icamax.c | 370 +++++++++---------- kernel/zarch/icamin.c | 370 +++++++++---------- kernel/zarch/idamax.c | 264 ++++++------- kernel/zarch/idamin.c | 264 ++++++------- kernel/zarch/idmax.c | 230 ++++++------ kernel/zarch/idmin.c | 230 ++++++------ kernel/zarch/isamax.c | 352 +++++++++--------- kernel/zarch/isamin.c | 352 +++++++++--------- kernel/zarch/ismax.c | 318 ++++++++-------- kernel/zarch/ismin.c | 318 ++++++++-------- kernel/zarch/izamax.c | 256 ++++++------- kernel/zarch/izamin.c | 256 ++++++------- kernel/zarch/samax.c | 94 ++--- kernel/zarch/samin.c | 94 ++--- kernel/zarch/sasum.c | 154 ++++---- kernel/zarch/saxpy.c | 152 ++++---- kernel/zarch/scopy.c | 20 +- kernel/zarch/sdot.c | 116 +++--- kernel/zarch/sgemv_n_4.c | 584 +++++++++++++++-------------- kernel/zarch/sgemv_t_4.c | 766 +++++++++++++++++++------------------- kernel/zarch/smax.c | 94 ++--- kernel/zarch/smin.c | 94 ++--- kernel/zarch/srot.c | 291 +++++++-------- kernel/zarch/sscal.c | 102 ++--- kernel/zarch/sswap.c | 151 ++++---- kernel/zarch/zamax.c | 166 ++++----- kernel/zarch/zamax_z13.c | 184 ++++----- kernel/zarch/zamin.c | 166 ++++----- kernel/zarch/zamin_z13.c | 184 ++++----- kernel/zarch/zasum.c | 150 ++++---- kernel/zarch/zaxpy.c | 138 +++---- kernel/zarch/zcopy.c | 21 +- kernel/zarch/zdot.c | 140 +++---- kernel/zarch/zgemv_n_4.c | 414 +++++++++++---------- kernel/zarch/zgemv_t_4.c | 452 ++++++++++++----------- kernel/zarch/zrot.c | 291 +++++++-------- kernel/zarch/zscal.c | 301 +++++++-------- kernel/zarch/zswap.c | 151 ++++---- 67 files changed, 7439 insertions(+), 7354 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 40a9903e94..b10ca4752d 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -34,112 +34,112 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v16,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v16,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v16,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v16,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v16,%%v16\n\t" - "vfasb %%v0,%%v0,%%v16\n\t" - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,8,4\n\t" - "vleib %%v1,9,5\n\t" - "vleib %%v1,10,6\n\t" - "vleib %%v1,11,7\n\t" - "vleib %%v1,16,8\n\t" - "vleib %%v1,17,9\n\t" - "vleib %%v1,18,10\n\t" - "vleib %%v1,19,11\n\t" - "vleib %%v1,24,12\n\t" - "vleib %%v1,25,13\n\t" - "vleib %%v1,26,14\n\t" - "vleib %%v1,27,15\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl 
%%v16,0(%%r1,%[x])\n\t" - "vl %%v2,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v2\n\t" - "vperm %%v16,%%v16,%%v2,%%v1\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v2,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v2,%%v1\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v2,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v2,%%v1\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v2,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v2,%%v1\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v2,144(%%r1,%[x])\n\t" - "vpkg %%v25,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v2,%%v1\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v2,176(%%r1,%[x])\n\t" - "vpkg %%v27,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v2,%%v1\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v2,208(%%r1,%[x])\n\t" - "vpkg %%v29,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v2,%%v1\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v2,240(%%r1,%[x])\n\t" - "vpkg %%v31,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v2,%%v1\n\t" - "vflpsb %%v16,%%v16\n\t" - "vflpsb %%v17,%%v17\n\t" - "vflpsb %%v18,%%v18\n\t" - "vflpsb %%v19,%%v19\n\t" - "vflpsb %%v20,%%v20\n\t" - "vflpsb %%v21,%%v21\n\t" - "vflpsb %%v22,%%v22\n\t" - "vflpsb %%v23,%%v23\n\t" - "vflpsb %%v24,%%v24\n\t" - "vflpsb %%v25,%%v25\n\t" - "vflpsb %%v26,%%v26\n\t" - "vflpsb %%v27,%%v27\n\t" - "vflpsb %%v28,%%v28\n\t" - "vflpsb %%v29,%%v29\n\t" - "vflpsb %%v30,%%v30\n\t" - "vflpsb %%v31,%%v31\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v18,%%v18,%%v19\n\t" - "vfasb %%v20,%%v20,%%v21\n\t" - "vfasb %%v22,%%v22,%%v23\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v26,%%v26,%%v27\n\t" - "vfasb %%v28,%%v28,%%v29\n\t" - "vfasb %%v30,%%v30,%%v31\n\t" - "vfmaxsb %%v16,%%v16,%%v24,0\n\t" - "vfmaxsb %%v18,%%v18,%%v26,0\n\t" - "vfmaxsb %%v20,%%v20,%%v28,0\n\t" - "vfmaxsb %%v22,%%v22,%%v30,0\n\t" - "vfmaxsb %%v16,%%v16,%%v20,0\n\t" - "vfmaxsb %%v18,%%v18,%%v22,0\n\t" - "vfmaxsb %%v16,%%v16,%%v18,0\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,0\n\t" - "ler %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm 
%%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); return amax; } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 842635afc4..40945fae81 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -34,112 +34,112 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v16,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v16,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v16,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v16,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v16,%%v16\n\t" - "vfasb %%v0,%%v0,%%v16\n\t" - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,8,4\n\t" - "vleib %%v1,9,5\n\t" - "vleib %%v1,10,6\n\t" - "vleib %%v1,11,7\n\t" - "vleib %%v1,16,8\n\t" - "vleib %%v1,17,9\n\t" - "vleib %%v1,18,10\n\t" - "vleib %%v1,19,11\n\t" - "vleib %%v1,24,12\n\t" - "vleib %%v1,25,13\n\t" - "vleib %%v1,26,14\n\t" - "vleib %%v1,27,15\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v2,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v2\n\t" - "vperm %%v16,%%v16,%%v2,%%v1\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v2,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v2,%%v1\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v2,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v2,%%v1\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl 
%%v2,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v2,%%v1\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v2,144(%%r1,%[x])\n\t" - "vpkg %%v25,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v2,%%v1\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v2,176(%%r1,%[x])\n\t" - "vpkg %%v27,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v2,%%v1\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v2,208(%%r1,%[x])\n\t" - "vpkg %%v29,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v2,%%v1\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v2,240(%%r1,%[x])\n\t" - "vpkg %%v31,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v2,%%v1\n\t" - "vflpsb %%v16,%%v16\n\t" - "vflpsb %%v17,%%v17\n\t" - "vflpsb %%v18,%%v18\n\t" - "vflpsb %%v19,%%v19\n\t" - "vflpsb %%v20,%%v20\n\t" - "vflpsb %%v21,%%v21\n\t" - "vflpsb %%v22,%%v22\n\t" - "vflpsb %%v23,%%v23\n\t" - "vflpsb %%v24,%%v24\n\t" - "vflpsb %%v25,%%v25\n\t" - "vflpsb %%v26,%%v26\n\t" - "vflpsb %%v27,%%v27\n\t" - "vflpsb %%v28,%%v28\n\t" - "vflpsb %%v29,%%v29\n\t" - "vflpsb %%v30,%%v30\n\t" - "vflpsb %%v31,%%v31\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v18,%%v18,%%v19\n\t" - "vfasb %%v20,%%v20,%%v21\n\t" - "vfasb %%v22,%%v22,%%v23\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v26,%%v26,%%v27\n\t" - "vfasb %%v28,%%v28,%%v29\n\t" - "vfasb %%v30,%%v30,%%v31\n\t" - "vfminsb %%v16,%%v16,%%v24,0\n\t" - "vfminsb %%v18,%%v18,%%v26,0\n\t" - "vfminsb %%v20,%%v20,%%v28,0\n\t" - "vfminsb %%v22,%%v22,%%v30,0\n\t" - "vfminsb %%v16,%%v16,%%v20,0\n\t" - "vfminsb %%v18,%%v18,%%v22,0\n\t" - "vfminsb %%v16,%%v16,%%v18,0\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,0\n\t" - "ler %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl 
%%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); return amin; } diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c index f59e5a20b3..e28f2018c7 100644 --- a/kernel/zarch/casum.c +++ b/kernel/zarch/casum.c @@ -34,83 +34,83 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - 
"vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index d86342bd0f..e4b484ab7d 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -30,73 +30,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__( #if !defined(CONJ) - "vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" + "vlrepf %%v0,0(%[alpha])\n\t" + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" #else - "vlef %%v0,0(%[alpha]),1\n\t" - "vlef %%v0,0(%[alpha]),3\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,0(%[alpha]),0\n\t" - "vlef %%v0,0(%[alpha]),2\n\t" - "vlrepf %%v1,4(%[alpha])\n\t" + "vlef %%v0,0(%[alpha]),1\n\t" + "vlef %%v0,0(%[alpha]),3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,0(%[alpha]),0\n\t" + "vlef %%v0,0(%[alpha]),2\n\t" + "vlrepf %%v1,4(%[alpha])\n\t" #endif - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" - "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" - "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" - "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" - "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" - "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" - "vfmasb %%v17,%%v17,%%v0,%%v21\n\t" - "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" - "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" - "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" - "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" - "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" - "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" - "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmasb 
%%v17,%%v17,%%v0,%%v21\n\t" + "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index d17bddcc86..0a5e03992a 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -29,16 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + [n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c index 64d81ae5c9..d90f9c8712 100644 --- a/kernel/zarch/cdot.c +++ b/kernel/zarch/cdot.c @@ -29,80 +29,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "verllg %%v22,%%v18,32\n\t" - "verllg %%v23,%%v19,32\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "verllg %%v22,%%v18,32\n\t" - "verllg %%v23,%%v19,32\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vrepg %%v26,%%v24,1\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v25,%%v25,%%v27\n\t" - "vfasb %%v25,%%v25,%%v29\n\t" - "vfasb %%v25,%%v25,%%v31\n\t" - "vrepg %%v27,%%v25,1\n\t" - "vfasb %%v25,%%v25,%%v27\n\t" - "vstef %%v24,0(%[d]),0\n\t" - "vstef %%v24,4(%[d]),1\n\t" - "vstef %%v25,8(%[d]),1\n\t" - "vstef %%v25,12(%[d]),0" - : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + 
"vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vrepg %%v26,%%v24,1\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vfasb %%v25,%%v25,%%v29\n\t" + "vfasb %%v25,%%v25,%%v31\n\t" + "vrepg %%v27,%%v25,1\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vstef %%v24,0(%[d]),0\n\t" + "vstef %%v24,4(%[d]),1\n\t" + "vstef %%v25,8(%[d]),1\n\t" + "vstef %%v25,12(%[d]),0" + : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index db91d90634..adba05d475 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -30,323 +30,331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vlrepg %%v16,0(%[x])\n\t" - "vlrepg %%v17,8(%[x])\n\t" - "vlrepg %%v18,16(%[x])\n\t" - "vlrepg %%v19,24(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v18,16(%[x])\n\t" + "vlrepg %%v19,24(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%[x]),0\n\t" - "vlef %%v20,4(%[x]),2\n\t" - "vflcsb %%v20,%%v20\n\t" - "vlef %%v20,0(%[x]),1\n\t" - "vlef %%v20,0(%[x]),3\n\t" - "vlef %%v21,12(%[x]),0\n\t" - "vlef %%v21,12(%[x]),2\n\t" - "vflcsb %%v21,%%v21\n\t" - "vlef %%v21,8(%[x]),1\n\t" - "vlef %%v21,8(%[x]),3\n\t" - "vlef %%v22,20(%[x]),0\n\t" - "vlef %%v22,20(%[x]),2\n\t" - "vflcsb %%v22,%%v22\n\t" - "vlef %%v22,16(%[x]),1\n\t" - "vlef %%v22,16(%[x]),3\n\t" - "vlef %%v23,28(%[x]),0\n\t" - "vlef %%v23,28(%[x]),2\n\t" - "vflcsb %%v23,%%v23\n\t" - "vlef %%v23,24(%[x]),1\n\t" - "vlef %%v23,24(%[x]),3\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" #else - "vlef %%v20,0(%[x]),1\n\t" - "vlef %%v20,0(%[x]),3\n\t" - "vflcsb %%v20,%%v20\n\t" - "vlef %%v20,4(%[x]),0\n\t" - "vlef %%v20,4(%[x]),2\n\t" - "vlef %%v21,8(%[x]),1\n\t" 
- "vlef %%v21,8(%[x]),3\n\t" - "vflcsb %%v21,%%v21\n\t" - "vlef %%v21,12(%[x]),0\n\t" - "vlef %%v21,12(%[x]),2\n\t" - "vlef %%v22,16(%[x]),1\n\t" - "vlef %%v22,16(%[x]),3\n\t" - "vflcsb %%v22,%%v22\n\t" - "vlef %%v22,20(%[x]),0\n\t" - "vlef %%v22,20(%[x]),2\n\t" - "vlef %%v23,24(%[x]),1\n\t" - "vlef %%v23,24(%[x]),3\n\t" - "vflcsb %%v23,%%v23\n\t" - "vlef %%v23,28(%[x]),0\n\t" - "vlef %%v23,28(%[x]),2\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" #endif - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vperm %%v25,%%v24,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v24,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap1])\n\t" - "vperm %%v27,%%v26,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v26,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" - "vl %%v28,0(%%r1,%[ap2])\n\t" - "vperm %%v29,%%v28,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v28,%%v1\n\t" - "vl %%v30,0(%%r1,%[ap3])\n\t" - "vperm %%v31,%%v30,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v30,%%v1\n\t" - "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" - "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" - "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" - "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib 
%%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vperm %%v25,%%v24,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v24,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap1])\n\t" + "vperm %%v27,%%v26,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v26,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" + "vl %%v28,0(%%r1,%[ap2])\n\t" + "vperm %%v29,%%v28,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v28,%%v1\n\t" + "vl %%v30,0(%%r1,%[ap3])\n\t" + "vperm %%v31,%%v30,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v30,%%v1\n\t" + "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vlrepg %%v16,0(%[x])\n\t" - "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%[x]),0\n\t" - "vlef %%v18,4(%[x]),2\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,0(%[x]),1\n\t" - "vlef %%v18,0(%[x]),3\n\t" - "vlef %%v19,12(%[x]),0\n\t" - "vlef %%v19,12(%[x]),2\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,8(%[x]),1\n\t" - "vlef %%v19,8(%[x]),3\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" #else - "vlef %%v18,0(%[x]),1\n\t" - "vlef %%v18,0(%[x]),3\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,4(%[x]),0\n\t" - "vlef %%v18,4(%[x]),2\n\t" - "vlef %%v19,8(%[x]),1\n\t" - "vlef %%v19,8(%[x]),3\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,12(%[x]),0\n\t" - "vlef %%v19,12(%[x]),2\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" #endif - 
"vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v20,0(%%r1,%[ap0])\n\t" - "vperm %%v21,%%v20,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v20,%%v1\n\t" - "vl %%v22,0(%%r1,%[ap1])\n\t" - "vperm %%v23,%%v22,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v22,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" - "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" - "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23"); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v20,0(%%r1,%[ap0])\n\t" + "vperm %%v21,%%v20,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v20,%%v1\n\t" + "vl %%v22,0(%%r1,%[ap1])\n\t" + "vperm %%v23,%%v22,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v22,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23"); } static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__("vlrepg %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) 
|| ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v17,4(%[x]),0\n\t" - "vlef %%v17,4(%[x]),2\n\t" - "vflcsb %%v17,%%v17\n\t" - "vlef %%v17,0(%[x]),1\n\t" - "vlef %%v17,0(%[x]),3\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" #else - "vlef %%v17,0(%[x]),1\n\t" - "vlef %%v17,0(%[x]),3\n\t" - "vflcsb %%v17,%%v17\n\t" - "vlef %%v17,4(%[x]),0\n\t" - "vlef %%v17,4(%[x]),2\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" #endif - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v18,0(%%r1,%[ap])\n\t" - "vperm %%v19,%%v18,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v18,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v18,0(%%r1,%[ap])\n\t" + "vperm %%v19,%%v18,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v18,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); } static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) { __asm__( #if !defined(XCONJ) - "vlrepf 
%%v0,%[alpha_r]\n\t" - "vlef %%v1,%[alpha_i],0\n\t" - "vlef %%v1,%[alpha_i],2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,%[alpha_i],1\n\t" - "vlef %%v1,%[alpha_i],3\n\t" + "vlrepf %%v0,%[alpha_r]\n\t" + "vlef %%v1,%[alpha_i],0\n\t" + "vlef %%v1,%[alpha_i],2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,%[alpha_i],1\n\t" + "vlef %%v1,%[alpha_i],3\n\t" #else - "vlef %%v0,%[alpha_r],1\n\t" - "vlef %%v0,%[alpha_r],3\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,%[alpha_r],0\n\t" - "vlef %%v0,%[alpha_r],2\n\t" - "vlrepf %%v1,%[alpha_i]\n\t" + "vlef %%v0,%[alpha_r],1\n\t" + "vlef %%v0,%[alpha_r],3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,%[alpha_r],0\n\t" + "vlef %%v0,%[alpha_r],2\n\t" + "vlrepf %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],2\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,0(%%r1,%[dest])\n\t" - "vl %%v19,16(%%r1,%[dest])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" - "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" - "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" - "vst %%v22,0(%%r1,%[dest])\n\t" - "vst %%v23,16(%%r1,%[dest])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), - [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,0(%%r1,%[dest])\n\t" + "vl %%v19,16(%%r1,%[dest])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" + "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" + "vst %%v22,0(%%r1,%[dest])\n\t" + "vst %%v23,16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 9e65c5fb59..91ea1c10c2 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -31,6 +31,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
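/*
 * A recurring change throughout this patch: asm memory operands move from
 * array casts such as "m"(*(const FLOAT (*)[n * 2]) ap[0]) to
 * anonymous-struct casts such as "m"(*(const struct { FLOAT x[n * 2]; } *)
 * ap0), with the ap[i] loads hoisted into register locals so each becomes
 * its own operand.  Both spellings tell the compiler that the asm reads or
 * writes the whole n*2-element buffer rather than just its first element,
 * so values cannot be cached in registers or reordered across the asm; the
 * struct form uses the GNU C extension that permits variable-length arrays
 * as struct members, presumably to suit compilers that reject VLA casts in
 * operand lists.  A minimal sketch of the idiom (helper name hypothetical):
 */
#include <stddef.h>
static inline void mark_buffer_used(size_t n, double *buf) {
  /* no instructions needed; the "+m" constraint alone publishes the span */
  __asm__ volatile("" : "+m"(*(struct { double x[n]; } *) buf));
}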
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v16\n\t" "vzero %%v17\n\t" "vzero %%v18\n\t" @@ -154,20 +159,23 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v23,%%v19,%%v21,%%v23\n\t" "vst %%v22,0(%[y])\n\t" "vst %%v23,16(%[y])" - : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v16\n\t" "vzero %%v17\n\t" "vzero %%v18\n\t" @@ -263,13 +271,13 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v20,%%v16,%%v18,%%v20\n\t" "vfmasb %%v20,%%v17,%%v19,%%v20\n\t" "vst %%v20,0(%[y])" - : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23"); + : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23"); } static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, @@ -353,11 +361,11 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vfmasb %%v0,%%v16,%%v18,%%v0\n\t" "vfmasb %%v0,%%v17,%%v19,%%v0\n\t" "vsteg %%v0,0(%[y]),0" - : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT 
x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c index 669d78a9d5..aab155f8b5 100644 --- a/kernel/zarch/crot.c +++ b/kernel/zarch/crot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepf %%v0,%[c]\n\t" - "vlrepf %%v1,%[s]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb 
%%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst 
%%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst 
%%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index a2d5bf2239..9fc54cf295 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -29,171 +29,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v24,%%v16,32\n\t" - "verllg %%v25,%%v17,32\n\t" - "verllg %%v26,%%v18,32\n\t" - "verllg %%v27,%%v19,32\n\t" - "verllg %%v28,%%v20,32\n\t" - "verllg %%v29,%%v21,32\n\t" - "verllg %%v30,%%v22,32\n\t" - "verllg %%v31,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v24,%%v16,32\n\t" + "verllg %%v25,%%v17,32\n\t" + "verllg %%v26,%%v18,32\n\t" + "verllg %%v27,%%v19,32\n\t" + "verllg %%v28,%%v20,32\n\t" + "verllg %%v29,%%v21,32\n\t" + "verllg %%v30,%%v22,32\n\t" + "verllg %%v31,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + 
"vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlef %%v0,4(%[alpha]),0\n\t" - "vlef %%v0,4(%[alpha]),2\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,4(%[alpha]),1\n\t" - "vlef %%v0,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v16,%%v16,32\n\t" - "verllg %%v17,%%v17,32\n\t" - "verllg %%v18,%%v18,32\n\t" - "verllg %%v19,%%v19,32\n\t" - "verllg %%v20,%%v20,32\n\t" - "verllg %%v21,%%v21,32\n\t" - "verllg %%v22,%%v22,32\n\t" - "verllg %%v23,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "vlef %%v0,4(%[alpha]),2\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,4(%[alpha]),1\n\t" + "vlef %%v0,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v16,%%v16,32\n\t" + "verllg %%v17,%%v17,32\n\t" + "verllg %%v18,%%v18,32\n\t" + "verllg %%v19,%%v19,32\n\t" + "verllg %%v20,%%v20,32\n\t" + "verllg %%v21,%%v21,32\n\t" + "verllg %%v22,%%v22,32\n\t" + "verllg %%v23,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb 
%%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" 
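/*
 * The four cscal_kernel_16* variants in this hunk are specializations of
 * complex x[k] *= alpha: the general case, alpha purely imaginary
 * (_zero_r), alpha purely real (_zero_i), and alpha == 0 (_zero, which
 * just stores zeros, continued below).  Splitting the cases presumably
 * avoids multiplies by an explicit 0.0 that could turn Inf/NaN elements
 * into NaNs.  The verllg ...,32 rotations swap the real/imaginary halves
 * of each complex pair so the cross terms fall out of plain vfmasb.
 * Scalar reference for the general case (names are illustrative):
 */
static void cscal_ref(long n, float ar, float ai, float *x) {
  for (long k = 0; k < n; k++) {
    float re = x[2 * k], im = x[2 * k + 1];
    x[2 * k]     = re * ar - im * ai;  /* real part */
    x[2 * k + 1] = im * ar + re * ai;  /* imaginary part */
  }
}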
+ "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c index 92a81591fb..198994e185 100644 --- a/kernel/zarch/cswap.c +++ b/kernel/zarch/cswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 
112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 37008f702d..caacb50dc1 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -34,51 +34,51 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxdb %%v16,%%v16,%%v24,8\n\t" - "vfmaxdb %%v17,%%v17,%%v25,8\n\t" - "vfmaxdb %%v18,%%v18,%%v26,8\n\t" - "vfmaxdb %%v19,%%v19,%%v27,8\n\t" - "vfmaxdb %%v20,%%v20,%%v28,8\n\t" - "vfmaxdb %%v21,%%v21,%%v29,8\n\t" - "vfmaxdb %%v22,%%v22,%%v30,8\n\t" - "vfmaxdb %%v23,%%v23,%%v31,8\n\t" - "vfmaxdb %%v16,%%v16,%%v20,8\n\t" - "vfmaxdb %%v17,%%v17,%%v21,8\n\t" - "vfmaxdb %%v18,%%v18,%%v22,8\n\t" - "vfmaxdb %%v19,%%v19,%%v23,8\n\t" - 
"vfmaxdb %%v16,%%v16,%%v18,8\n\t" - "vfmaxdb %%v17,%%v17,%%v19,8\n\t" - "vfmaxdb %%v16,%%v16,%%v17,8\n\t" - "vfmaxdb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,8\n\t" - "lpdr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxdb %%v16,%%v16,%%v24,8\n\t" + "vfmaxdb %%v17,%%v17,%%v25,8\n\t" + "vfmaxdb %%v18,%%v18,%%v26,8\n\t" + "vfmaxdb %%v19,%%v19,%%v27,8\n\t" + "vfmaxdb %%v20,%%v20,%%v28,8\n\t" + "vfmaxdb %%v21,%%v21,%%v29,8\n\t" + "vfmaxdb %%v22,%%v22,%%v30,8\n\t" + "vfmaxdb %%v23,%%v23,%%v31,8\n\t" + "vfmaxdb %%v16,%%v16,%%v20,8\n\t" + "vfmaxdb %%v17,%%v17,%%v21,8\n\t" + "vfmaxdb %%v18,%%v18,%%v22,8\n\t" + "vfmaxdb %%v19,%%v19,%%v23,8\n\t" + "vfmaxdb %%v16,%%v16,%%v18,8\n\t" + "vfmaxdb %%v17,%%v17,%%v19,8\n\t" + "vfmaxdb %%v16,%%v16,%%v17,8\n\t" + "vfmaxdb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 530d6e5bb6..f3db4c108f 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -34,85 +34,85 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl 
%%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg 
%%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index a01791741d..0163a144b3 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -34,51 +34,51 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmindb %%v16,%%v16,%%v24,8\n\t" - "vfmindb %%v17,%%v17,%%v25,8\n\t" - "vfmindb %%v18,%%v18,%%v26,8\n\t" - "vfmindb %%v19,%%v19,%%v27,8\n\t" - "vfmindb %%v20,%%v20,%%v28,8\n\t" - "vfmindb %%v21,%%v21,%%v29,8\n\t" - "vfmindb %%v22,%%v22,%%v30,8\n\t" - "vfmindb %%v23,%%v23,%%v31,8\n\t" - "vfmindb %%v16,%%v16,%%v20,8\n\t" - "vfmindb %%v17,%%v17,%%v21,8\n\t" - "vfmindb %%v18,%%v18,%%v22,8\n\t" - "vfmindb %%v19,%%v19,%%v23,8\n\t" - "vfmindb %%v16,%%v16,%%v18,8\n\t" - "vfmindb %%v17,%%v17,%%v19,8\n\t" - "vfmindb %%v16,%%v16,%%v17,8\n\t" - "vfmindb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,8\n\t" - "lpdr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,8\n\t" + "vfmindb %%v17,%%v17,%%v25,8\n\t" + "vfmindb %%v18,%%v18,%%v26,8\n\t" + "vfmindb %%v19,%%v19,%%v27,8\n\t" + "vfmindb %%v20,%%v20,%%v28,8\n\t" + "vfmindb %%v21,%%v21,%%v29,8\n\t" + "vfmindb %%v22,%%v22,%%v30,8\n\t" + "vfmindb %%v23,%%v23,%%v31,8\n\t" + "vfmindb %%v16,%%v16,%%v20,8\n\t" + "vfmindb %%v17,%%v17,%%v21,8\n\t" + "vfmindb %%v18,%%v18,%%v22,8\n\t" + "vfmindb %%v19,%%v19,%%v23,8\n\t" + "vfmindb %%v16,%%v16,%%v18,8\n\t" + "vfmindb %%v17,%%v17,%%v19,8\n\t" + "vfmindb %%v16,%%v16,%%v17,8\n\t" + "vfmindb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", 
"v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 2172b6d6f5..4196b2e15f 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -34,85 +34,85 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb 
%%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 9f69a99314..aa1382b103 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -34,81 +34,81 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, 
%%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 179ef8834c..5b0208c20e 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -29,82 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepg %%v0,%[alpha]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl %%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" - "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), - [alpha] "m"(*alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + 
"vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + [alpha] "Q"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index b6a740c431..691b90c64c 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -29,16 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f5f601717c..9cad68f4b6 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -31,60 +31,60 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { FLOAT dot; __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "ldr %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), - [y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmadb 
%%v3,%%v19,%%v27,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return dot; } diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index c93ff9b548..502ba837ea 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -31,324 +31,334 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v1,8(%[x])\n\t" - "vlrepg %%v2,16(%[x])\n\t" - "vlrepg %%v3,24(%[x])\n\t" - "vlrepg %%v4,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v4\n\t" - "vfmdb %%v1,%%v1,%%v4\n\t" - "vfmdb %%v2,%%v2,%%v4\n\t" - "vfmdb %%v3,%%v3,%%v4\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl %%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl 
%%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,16(%[x])\n\t" + "vlrepg %%v3,24(%[x])\n\t" + "vlrepg %%v4,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v4\n\t" + "vfmdb %%v1,%%v1,%%v4\n\t" + "vfmdb %%v2,%%v2,%%v4\n\t" + "vfmdb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl 
%%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", 
"v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v1,8(%[x])\n\t" - "vlrepg %%v2,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v2\n\t" - "vfmdb %%v1,%%v1,%%v2\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" - "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" - "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" - "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" - "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v2\n\t" + "vfmdb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + 
"vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v16,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v16\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl 
%%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,0(%%r1,%[y])\n\t" - "vl %%v19,16(%%r1,%[y])\n\t" - "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" - "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y])\n\t" - "vst %%v19,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepg %%v16,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" + "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" + "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static 
void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 24680cf1b7..de72a1798a 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -30,333 +30,341 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi 
%%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v1,%%v1,%%v5\n\t" - "vfadb %%v2,%%v2,%%v6\n\t" - "vfadb %%v3,%%v3,%%v7\n\t" - "vrepg %%v4,%%v0,1\n\t" - "adbr %%f0,%%f4\n\t" - "std %%f0,0(%[y])\n\t" - "vrepg %%v4,%%v1,1\n\t" - "adbr %%f1,%%f4\n\t" - "std %%f1,8(%[y])\n\t" - "vrepg %%v4,%%v2,1\n\t" - "adbr %%f2,%%f4\n\t" - "std %%f2,16(%[y])\n\t" - "vrepg %%v4,%%v3,1\n\t" - "adbr %%f3,%%f4\n\t" - "std %%f3,24(%[y])" - : "=m"(*(FLOAT (*)[4]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl 
%%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v2,%%v2,%%v6\n\t" + "vfadb %%v3,%%v3,%%v7\n\t" + "vrepg %%v4,%%v0,1\n\t" + "adbr %%f0,%%f4\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v4,%%v1,1\n\t" + "adbr %%f1,%%f4\n\t" + "std %%f1,8(%[y])\n\t" + "vrepg %%v4,%%v2,1\n\t" + "adbr %%f2,%%f4\n\t" + "std %%f2,16(%[y])\n\t" + "vrepg %%v4,%%v3,1\n\t" + "adbr %%f3,%%f4\n\t" + "std %%f3,24(%[y])" + : "=m"(*(struct { FLOAT x[4]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl 
%%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" - "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" - "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v1,%%v1,%%v3\n\t" - "vfadb %%v1,%%v1,%%v5\n\t" - "vfadb %%v1,%%v1,%%v7\n\t" - "vrepg %%v2,%%v0,1\n\t" - "adbr %%f0,%%f2\n\t" - "std %%f0,0(%[y])\n\t" - "vrepg %%v2,%%v1,1\n\t" - "adbr %%f1,%%f2\n\t" - "std %%f1,8(%[y])" - : "=m"(*(FLOAT (*)[2]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmadb 
%%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v1,%%v1,%%v3\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v1,%%v1,%%v7\n\t" + "vrepg %%v2,%%v0,1\n\t" + "adbr %%f0,%%f2\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v2,%%v1,1\n\t" + "adbr %%f1,%%f2\n\t" + "std %%f1,8(%[y])" + : "=m"(*(struct { FLOAT x[2]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[a0])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmadb 
%%v1,%%v17,%%v25,%%v1\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "std %%f0,0(%[y])" - : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "std %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { @@ -369,74 +377,74 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { __asm__("vlrepg %%v0,%[da]\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl 
%%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" - "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) dest) - : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), - [src] "a"(src),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + 
"3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index 65ed31f01b..cdc8d5d08f 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -31,51 +31,51 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxdb %%v16,%%v16,%%v24,0\n\t" - "vfmaxdb %%v17,%%v17,%%v25,0\n\t" - "vfmaxdb %%v18,%%v18,%%v26,0\n\t" - "vfmaxdb %%v19,%%v19,%%v27,0\n\t" - "vfmaxdb %%v20,%%v20,%%v28,0\n\t" - "vfmaxdb %%v21,%%v21,%%v29,0\n\t" - "vfmaxdb %%v22,%%v22,%%v30,0\n\t" - "vfmaxdb %%v23,%%v23,%%v31,0\n\t" - "vfmaxdb %%v16,%%v16,%%v20,0\n\t" - "vfmaxdb %%v17,%%v17,%%v21,0\n\t" - "vfmaxdb %%v18,%%v18,%%v22,0\n\t" - "vfmaxdb %%v19,%%v19,%%v23,0\n\t" - "vfmaxdb %%v16,%%v16,%%v18,0\n\t" - "vfmaxdb %%v17,%%v17,%%v19,0\n\t" - "vfmaxdb %%v16,%%v16,%%v17,0\n\t" - "vfmaxdb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,0\n\t" - "ldr %[max],%%f0" - : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxdb %%v16,%%v16,%%v24,0\n\t" + "vfmaxdb %%v17,%%v17,%%v25,0\n\t" + "vfmaxdb %%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v19,%%v19,%%v27,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v21,%%v21,%%v29,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v23,%%v23,%%v31,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v17,%%v17,%%v21,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v19,%%v19,%%v23,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v17,%%v17,%%v19,0\n\t" + "vfmaxdb %%v16,%%v16,%%v17,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : 
"cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return max; } diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 87bccbe55d..c4e8d91f87 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -31,68 +31,68 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[max],%%f0" - : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" 
+ "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return max; } diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 518cc262ce..f9b129cbd9 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -31,51 +31,51 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmindb %%v16,%%v16,%%v24,0\n\t" - "vfmindb %%v17,%%v17,%%v25,0\n\t" - "vfmindb %%v18,%%v18,%%v26,0\n\t" - "vfmindb %%v19,%%v19,%%v27,0\n\t" - "vfmindb %%v20,%%v20,%%v28,0\n\t" - "vfmindb %%v21,%%v21,%%v29,0\n\t" - "vfmindb %%v22,%%v22,%%v30,0\n\t" - "vfmindb %%v23,%%v23,%%v31,0\n\t" - "vfmindb %%v16,%%v16,%%v20,0\n\t" - "vfmindb %%v17,%%v17,%%v21,0\n\t" - "vfmindb %%v18,%%v18,%%v22,0\n\t" - "vfmindb %%v19,%%v19,%%v23,0\n\t" - "vfmindb %%v16,%%v16,%%v18,0\n\t" - "vfmindb %%v17,%%v17,%%v19,0\n\t" - "vfmindb %%v16,%%v16,%%v17,0\n\t" - "vfmindb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,0\n\t" - "ldr %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v17,%%v17,%%v25,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v19,%%v19,%%v27,0\n\t" 
+ "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v21,%%v21,%%v29,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v23,%%v23,%%v31,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v17,%%v17,%%v21,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v19,%%v19,%%v23,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v17,%%v17,%%v19,0\n\t" + "vfmindb %%v16,%%v16,%%v17,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return min; } diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index 91561992f5..77f021c1d9 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -31,68 +31,68 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + 
"vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return min; } diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index 8f0197f023..11fbe15b6d 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepg %%v0,%[c]\n\t" - "vlrepg %%v1,%[s]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb 
%%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb 
%%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", 
"v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index c944990b5a..2961eff202 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -29,61 +29,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { __asm__("vlrepg %%v0,%[da]\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" - "vfmdb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" - "vfmdb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" - "vfmdb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" - "vfmdb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" - "vfmdb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" - "vfmdb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" - "vfmdb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vfmdb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x),[da] "m"(da) - : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmdb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmdb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmdb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmdb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmdb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmdb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmdb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmdb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : [x] "a"(x),[da] "Q"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) x),[n] 
"+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 1ac02d4b93..5fa88c3b92 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -31,91 +31,92 @@ static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { double dot; __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vlef %%v16,0(%%r1,%[x]),0\n\t" - "vlef %%v16,4(%%r1,%[x]),2\n\t" - "vlef %%v17,8(%%r1,%[x]),0\n\t" - "vlef %%v17,12(%%r1,%[x]),2\n\t" - "vlef %%v18,16(%%r1,%[x]),0\n\t" - "vlef %%v18,20(%%r1,%[x]),2\n\t" - "vlef %%v19,24(%%r1,%[x]),0\n\t" - "vlef %%v19,28(%%r1,%[x]),2\n\t" - "vlef %%v20,32(%%r1,%[x]),0\n\t" - "vlef %%v20,36(%%r1,%[x]),2\n\t" - "vlef %%v21,40(%%r1,%[x]),0\n\t" - "vlef %%v21,44(%%r1,%[x]),2\n\t" - "vlef %%v22,48(%%r1,%[x]),0\n\t" - "vlef %%v22,52(%%r1,%[x]),2\n\t" - "vlef %%v23,56(%%r1,%[x]),0\n\t" - "vlef %%v23,60(%%r1,%[x]),2\n\t" - "vflls %%v16,%%v16\n\t" - "vflls %%v17,%%v17\n\t" - "vflls %%v18,%%v18\n\t" - "vflls %%v19,%%v19\n\t" - "vflls %%v20,%%v20\n\t" - "vflls %%v21,%%v21\n\t" - "vflls %%v22,%%v22\n\t" - "vflls %%v23,%%v23\n\t" - "vlef %%v24,0(%%r1,%[y]),0\n\t" - "vlef %%v24,4(%%r1,%[y]),2\n\t" - "vflls %%v24,%%v24\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vlef %%v25,8(%%r1,%[y]),0\n\t" - "vlef %%v25,12(%%r1,%[y]),2\n\t" - "vflls %%v25,%%v25\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vlef %%v26,16(%%r1,%[y]),0\n\t" - "vlef %%v26,20(%%r1,%[y]),2\n\t" - "vflls %%v26,%%v26\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vlef %%v27,24(%%r1,%[y]),0\n\t" - "vlef %%v27,28(%%r1,%[y]),2\n\t" - "vflls %%v27,%%v27\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vlef %%v28,32(%%r1,%[y]),0\n\t" - "vlef %%v28,36(%%r1,%[y]),2\n\t" - "vflls %%v28,%%v28\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vlef %%v29,40(%%r1,%[y]),0\n\t" - "vlef %%v29,44(%%r1,%[y]),2\n\t" - "vflls %%v29,%%v29\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vlef %%v30,48(%%r1,%[y]),0\n\t" - "vlef %%v30,52(%%r1,%[y]),2\n\t" - "vflls %%v30,%%v30\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vlef %%v31,56(%%r1,%[y]),0\n\t" - "vlef %%v31,60(%%r1,%[y]),2\n\t" - "vflls %%v31,%%v31\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,64\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "ldr %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vlef %%v16,0(%%r1,%[x]),0\n\t" + "vlef %%v16,4(%%r1,%[x]),2\n\t" + "vlef %%v17,8(%%r1,%[x]),0\n\t" + "vlef %%v17,12(%%r1,%[x]),2\n\t" + "vlef %%v18,16(%%r1,%[x]),0\n\t" + "vlef %%v18,20(%%r1,%[x]),2\n\t" + "vlef 
%%v19,24(%%r1,%[x]),0\n\t" + "vlef %%v19,28(%%r1,%[x]),2\n\t" + "vlef %%v20,32(%%r1,%[x]),0\n\t" + "vlef %%v20,36(%%r1,%[x]),2\n\t" + "vlef %%v21,40(%%r1,%[x]),0\n\t" + "vlef %%v21,44(%%r1,%[x]),2\n\t" + "vlef %%v22,48(%%r1,%[x]),0\n\t" + "vlef %%v22,52(%%r1,%[x]),2\n\t" + "vlef %%v23,56(%%r1,%[x]),0\n\t" + "vlef %%v23,60(%%r1,%[x]),2\n\t" + "vflls %%v16,%%v16\n\t" + "vflls %%v17,%%v17\n\t" + "vflls %%v18,%%v18\n\t" + "vflls %%v19,%%v19\n\t" + "vflls %%v20,%%v20\n\t" + "vflls %%v21,%%v21\n\t" + "vflls %%v22,%%v22\n\t" + "vflls %%v23,%%v23\n\t" + "vlef %%v24,0(%%r1,%[y]),0\n\t" + "vlef %%v24,4(%%r1,%[y]),2\n\t" + "vflls %%v24,%%v24\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vlef %%v25,8(%%r1,%[y]),0\n\t" + "vlef %%v25,12(%%r1,%[y]),2\n\t" + "vflls %%v25,%%v25\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vlef %%v26,16(%%r1,%[y]),0\n\t" + "vlef %%v26,20(%%r1,%[y]),2\n\t" + "vflls %%v26,%%v26\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vlef %%v27,24(%%r1,%[y]),0\n\t" + "vlef %%v27,28(%%r1,%[y]),2\n\t" + "vflls %%v27,%%v27\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vlef %%v28,32(%%r1,%[y]),0\n\t" + "vlef %%v28,36(%%r1,%[y]),2\n\t" + "vflls %%v28,%%v28\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vlef %%v29,40(%%r1,%[y]),0\n\t" + "vlef %%v29,44(%%r1,%[y]),2\n\t" + "vflls %%v29,%%v29\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vlef %%v30,48(%%r1,%[y]),0\n\t" + "vlef %%v30,52(%%r1,%[y]),2\n\t" + "vflls %%v30,%%v30\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vlef %%v31,56(%%r1,%[y]),0\n\t" + "vlef %%v31,60(%%r1,%[y]),2\n\t" + "vflls %%v31,%%v31\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return dot; } diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 60ba40bd62..f0c9ded511 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
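dsdot_kernel_16 takes single-precision inputs but accumulates in double: vlef places two floats into word elements 0 and 2 of each register and vflls widens them to doubles before the fused multiply-add. Scalar model (hypothetical _ref name; FLOAT is float in this file; n a multiple of 16):

static double dsdot_ref(BLASLONG n, float *x, float *y) {
  double dot = 0.0;
  BLASLONG i;
  for (i = 0; i < n; i++)
    dot += (double)x[i] * (double)y[i]; /* vflls widening + vfmadb */
  return dot;
}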
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 
48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 1e1040a6e2..a2546b8124 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -34,191 +34,191 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v1,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v1,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v1,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v1,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v1,%%v1\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,16\n\t" - "vzero %%v4\n\t" - "vleib %%v9,0,0\n\t" - "vleib %%v9,1,1\n\t" - "vleib %%v9,2,2\n\t" - "vleib %%v9,3,3\n\t" - "vleib %%v9,8,4\n\t" - "vleib %%v9,9,5\n\t" - "vleib %%v9,10,6\n\t" - "vleib %%v9,11,7\n\t" - "vleib %%v9,16,8\n\t" - "vleib %%v9,17,9\n\t" - "vleib %%v9,18,10\n\t" - "vleib %%v9,19,11\n\t" - "vleib %%v9,24,12\n\t" - "vleib %%v9,25,13\n\t" - "vleib %%v9,26,14\n\t" - "vleib %%v9,27,15\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v28,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v29,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v30,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - 
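dswap_kernel_32 above is a straight exchange unrolled to 32 doubles per iteration, staging the x block in v16-v31 and the two y halves in v0-v7; functionally it is just this sketch (hypothetical _ref name):

static void dswap_ref(BLASLONG n, FLOAT *x, FLOAT *y) {
  BLASLONG i;
  for (i = 0; i < n; i++) {
    FLOAT t = x[i];
    x[i] = y[i];
    y[i] = t;
  }
}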
"vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v28,144(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v29,176(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v30,208(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) 
x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, 
%%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return iamax; } diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index d1c0e32a1e..09654b7426 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -34,191 +34,191 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v1,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v1,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v1,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v1,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v1,%%v1\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,16\n\t" - "vzero %%v4\n\t" - "vleib %%v9,0,0\n\t" - "vleib %%v9,1,1\n\t" - "vleib %%v9,2,2\n\t" - "vleib %%v9,3,3\n\t" - "vleib %%v9,8,4\n\t" - "vleib %%v9,9,5\n\t" - "vleib %%v9,10,6\n\t" - "vleib %%v9,11,7\n\t" - "vleib %%v9,16,8\n\t" - "vleib %%v9,17,9\n\t" - "vleib %%v9,18,10\n\t" - "vleib %%v9,19,11\n\t" - "vleib %%v9,24,12\n\t" - "vleib %%v9,25,13\n\t" - "vleib %%v9,26,14\n\t" - "vleib %%v9,27,15\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - 
"srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v28,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v29,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v30,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v28,144(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v29,176(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v30,208(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" 
- "jne 1f\n\t" - "vstef %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm 
%%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return iamin; } diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 8434c811f4..b292c1d151 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -34,138 +34,138 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl 
%%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig 
%%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + 
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamax; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 80a37e6c25..f9a8119e15 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -34,138 +34,138 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb 
%%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel 
%%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamin; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 18cdba4376..8f283bc170 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -31,121 +31,121 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl 
%%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[max],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[max]\n\t" - "vlgvg %[imax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl 
%%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imax; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 02ca427e47..e4b7bb4fe3 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -31,121 +31,121 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - 
"vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[min],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[min]\n\t" - "vlgvg %[imin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag 
%%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imin; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index bbb4012aae..ac86435d77 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -34,182 +34,182 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vl %%v0,0(%[x])\n\t" - "vflpsb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl 
%%v23,112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel 
%%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb 
%%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamax; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index e8b34b934a..3f2d039eb9 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -34,182 +34,182 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vl %%v0,0(%[x])\n\t" - "vflpsb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl 
%%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" 
- "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + 
"vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamin; } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index a565df5031..41172c1bd3 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -31,165 +31,165 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl 
%%v23,112(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[max],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[max]\n\t" - "vlgvg %[imax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + 
"vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag 
%%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imax; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index ff72b2c641..e2684df416 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -31,165 +31,165 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - 
"vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[min],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[min]\n\t" - "vlgvg %[imin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl 
%%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", 
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imin; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 48afb8215b..daca1d6f71 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -34,134 +34,134 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v1,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v1,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v1,%%v1\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,8\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel 
%%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + 
"vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); return iamax; } diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 3edbe3d58c..9ababb91fd 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -34,134 +34,134 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v1,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v1,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v1,%%v1\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,8\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg 
%%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel 
%%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); return iamin; } diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index efbc0318c8..fdda6dd321 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -34,53 +34,53 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxsb %%v16,%%v16,%%v24,8\n\t" - "vfmaxsb %%v17,%%v17,%%v25,8\n\t" - "vfmaxsb %%v18,%%v18,%%v26,8\n\t" - "vfmaxsb %%v19,%%v19,%%v27,8\n\t" - "vfmaxsb %%v20,%%v20,%%v28,8\n\t" - "vfmaxsb %%v21,%%v21,%%v29,8\n\t" - 
"vfmaxsb %%v22,%%v22,%%v30,8\n\t" - "vfmaxsb %%v23,%%v23,%%v31,8\n\t" - "vfmaxsb %%v16,%%v16,%%v20,8\n\t" - "vfmaxsb %%v17,%%v17,%%v21,8\n\t" - "vfmaxsb %%v18,%%v18,%%v22,8\n\t" - "vfmaxsb %%v19,%%v19,%%v23,8\n\t" - "vfmaxsb %%v16,%%v16,%%v18,8\n\t" - "vfmaxsb %%v17,%%v17,%%v19,8\n\t" - "vfmaxsb %%v16,%%v16,%%v17,8\n\t" - "vfmaxsb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,8\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,8\n\t" - "lper %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,8\n\t" + "vfmaxsb %%v17,%%v17,%%v25,8\n\t" + "vfmaxsb %%v18,%%v18,%%v26,8\n\t" + "vfmaxsb %%v19,%%v19,%%v27,8\n\t" + "vfmaxsb %%v20,%%v20,%%v28,8\n\t" + "vfmaxsb %%v21,%%v21,%%v29,8\n\t" + "vfmaxsb %%v22,%%v22,%%v30,8\n\t" + "vfmaxsb %%v23,%%v23,%%v31,8\n\t" + "vfmaxsb %%v16,%%v16,%%v20,8\n\t" + "vfmaxsb %%v17,%%v17,%%v21,8\n\t" + "vfmaxsb %%v18,%%v18,%%v22,8\n\t" + "vfmaxsb %%v19,%%v19,%%v23,8\n\t" + "vfmaxsb %%v16,%%v16,%%v18,8\n\t" + "vfmaxsb %%v17,%%v17,%%v19,8\n\t" + "vfmaxsb %%v16,%%v16,%%v17,8\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 138836ce57..f05e851f96 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -34,53 +34,53 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfminsb %%v16,%%v16,%%v24,8\n\t" - "vfminsb %%v17,%%v17,%%v25,8\n\t" - "vfminsb %%v18,%%v18,%%v26,8\n\t" - "vfminsb %%v19,%%v19,%%v27,8\n\t" - "vfminsb %%v20,%%v20,%%v28,8\n\t" - "vfminsb %%v21,%%v21,%%v29,8\n\t" - "vfminsb %%v22,%%v22,%%v30,8\n\t" - "vfminsb %%v23,%%v23,%%v31,8\n\t" - "vfminsb %%v16,%%v16,%%v20,8\n\t" - "vfminsb 
%%v17,%%v17,%%v21,8\n\t" - "vfminsb %%v18,%%v18,%%v22,8\n\t" - "vfminsb %%v19,%%v19,%%v23,8\n\t" - "vfminsb %%v16,%%v16,%%v18,8\n\t" - "vfminsb %%v17,%%v17,%%v19,8\n\t" - "vfminsb %%v16,%%v16,%%v17,8\n\t" - "vfminsb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,8\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,8\n\t" - "lper %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,8\n\t" + "vfminsb %%v17,%%v17,%%v25,8\n\t" + "vfminsb %%v18,%%v18,%%v26,8\n\t" + "vfminsb %%v19,%%v19,%%v27,8\n\t" + "vfminsb %%v20,%%v20,%%v28,8\n\t" + "vfminsb %%v21,%%v21,%%v29,8\n\t" + "vfminsb %%v22,%%v22,%%v30,8\n\t" + "vfminsb %%v23,%%v23,%%v31,8\n\t" + "vfminsb %%v16,%%v16,%%v20,8\n\t" + "vfminsb %%v17,%%v17,%%v21,8\n\t" + "vfminsb %%v18,%%v18,%%v22,8\n\t" + "vfminsb %%v19,%%v19,%%v23,8\n\t" + "vfminsb %%v16,%%v16,%%v18,8\n\t" + "vfminsb %%v17,%%v17,%%v19,8\n\t" + "vfminsb %%v16,%%v16,%%v17,8\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c index 0c3057a929..d56f2697b1 100644 --- a/kernel/zarch/sasum.c +++ b/kernel/zarch/sasum.c @@ -34,83 +34,83 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 
160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", 
"v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c index e41e87af07..ca34a47ff3 100644 --- a/kernel/zarch/saxpy.c +++ b/kernel/zarch/saxpy.c @@ -29,82 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepf %%v0,%[alpha]\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl %%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" - "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), - [alpha] "m"(*alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl 
%%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + [alpha] "Q"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index 4e4993737d..5c453cfbb9 100644 --- a/kernel/zarch/scopy.c +++ b/kernel/zarch/scopy.c @@ -29,16 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],6\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index f659b0c8a5..d870b30f07 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -31,64 +31,64 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { FLOAT dot; __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" - "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v3\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v5\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v0,%%v0,%%v7\n\t" - "vrepf %%v1,%%v0,1\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepf %%v3,%%v0,3\n\t" - "aebr %%f0,%%f1\n\t" - "aebr %%f0,%%f2\n\t" - "aebr %%f0,%%f3\n\t" - "ler %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), - [y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + 
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "vrepf %%v1,%%v0,1\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepf %%v3,%%v0,3\n\t" + "aebr %%f0,%%f1\n\t" + "aebr %%f0,%%f2\n\t" + "aebr %%f0,%%f3\n\t" + "ler %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return dot; } diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 86ac249931..a1efef373f 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -31,304 +31,314 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v1,4(%[x])\n\t" - "vlrepf %%v2,8(%[x])\n\t" - "vlrepf %%v3,12(%[x])\n\t" - "vlrepf %%v4,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v4\n\t" - "vfmsb %%v1,%%v1,%%v4\n\t" - "vfmsb %%v2,%%v2,%%v4\n\t" - "vfmsb %%v3,%%v3,%%v4\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - 
"vl %%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,8(%[x])\n\t" + "vlrepf %%v3,12(%[x])\n\t" + "vlrepf %%v4,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v4\n\t" + "vfmsb %%v1,%%v1,%%v4\n\t" + "vfmsb %%v2,%%v2,%%v4\n\t" + "vfmsb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb 
%%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v1,4(%[x])\n\t" - "vlrepf %%v2,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v2\n\t" - "vfmsb %%v1,%%v1,%%v2\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 
1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" - "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" - "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" - "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" - "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v2\n\t" + "vfmsb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + 
"vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v16,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v16\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,0(%%r1,%[y])\n\t" - "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" - "vst %%v17,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg 
%%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepf %%v16,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,0(%%r1,%[y])\n\t" + "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" + "vst %%v17,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 6ae9b6d7f2..81d7c9fe74 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -30,330 +30,338 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
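The transposed kernels in this file compute partial dot products of up to four matrix columns against a panel of x (at most NBMAX elements per panel), reduce each vector accumulator horizontally, and store one scalar per column; scaling by alpha happens afterwards in add_y. A scalar sketch of the 4x4 kernel's contract (illustrative only; sgemv_t_4x4_ref is not part of the patch):

/* What sgemv_kernel_4x4 below computes: four column-times-panel dots. */
static void sgemv_t_4x4_ref(long n, float **ap, const float *x, float *y) {
    int k;
    long i;
    for (k = 0; k < 4; k++) {
        float s = 0.0f;
        for (i = 0; i < n; i++)
            s += ap[k][i] * x[i];  /* dot of column k with the x panel */
        y[k] = s;                  /* stored, not accumulated; alpha applied in add_y */
    }
}
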
#define NBMAX 2048 static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmasb 
%%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v1,%%v1,%%v5\n\t" - "vfasb %%v2,%%v2,%%v6\n\t" - "vfasb %%v3,%%v3,%%v7\n\t" - "veslg %%v4,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vrepg %%v4,%%v0,1\n\t" - "aebr %%f0,%%f4\n\t" - "ste %%f0,0(%[y])\n\t" - "veslg %%v4,%%v1,32\n\t" - "vfasb %%v1,%%v1,%%v4\n\t" - "vrepg %%v4,%%v1,1\n\t" - "aebr %%f1,%%f4\n\t" - "ste %%f1,4(%[y])\n\t" - "veslg %%v4,%%v2,32\n\t" - "vfasb %%v2,%%v2,%%v4\n\t" - "vrepg %%v4,%%v2,1\n\t" - "aebr %%f2,%%f4\n\t" - "ste %%f2,8(%[y])\n\t" - "veslg %%v4,%%v3,32\n\t" - "vfasb %%v3,%%v3,%%v4\n\t" - "vrepg %%v4,%%v3,1\n\t" - "aebr %%f3,%%f4\n\t" - "ste %%f3,12(%[y])" - : "=m"(*(FLOAT (*)[4]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmasb 
%%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v2,%%v2,%%v6\n\t" + "vfasb %%v3,%%v3,%%v7\n\t" + "veslg %%v4,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vrepg %%v4,%%v0,1\n\t" + "aebr %%f0,%%f4\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v4,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v4\n\t" + "vrepg %%v4,%%v1,1\n\t" + "aebr %%f1,%%f4\n\t" + "ste %%f1,4(%[y])\n\t" + "veslg %%v4,%%v2,32\n\t" + "vfasb %%v2,%%v2,%%v4\n\t" + "vrepg %%v4,%%v2,1\n\t" + "aebr %%f2,%%f4\n\t" + "ste %%f2,8(%[y])\n\t" + "veslg %%v4,%%v3,32\n\t" + "vfasb %%v3,%%v3,%%v4\n\t" + "vrepg %%v4,%%v3,1\n\t" + "aebr %%f3,%%f4\n\t" + "ste %%f3,12(%[y])" + : "=m"(*(struct { FLOAT x[4]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" - "vfmasb 
%%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" - "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" - "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v1,%%v1,%%v3\n\t" - "vfasb %%v1,%%v1,%%v5\n\t" - "vfasb %%v1,%%v1,%%v7\n\t" - "veslg %%v2,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vrepg %%v2,%%v0,1\n\t" - "aebr %%f0,%%f2\n\t" - "ste %%f0,0(%[y])\n\t" - "veslg %%v2,%%v1,32\n\t" - "vfasb %%v1,%%v1,%%v2\n\t" - "vrepg %%v2,%%v1,1\n\t" - "aebr %%f1,%%f2\n\t" - "ste %%f1,4(%[y])" - : "=m"(*(FLOAT (*)[2]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" + "vl 
%%v30,112(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v1,%%v1,%%v3\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v1,%%v1,%%v7\n\t" + "veslg %%v2,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vrepg %%v2,%%v0,1\n\t" + "aebr %%f0,%%f2\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v2,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v2\n\t" + "vrepg %%v2,%%v1,1\n\t" + "aebr %%f1,%%f2\n\t" + "ste %%f1,4(%[y])" + : "=m"(*(struct { FLOAT x[2]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" - "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" - "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" - "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[a0])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v3\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v5\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v0,%%v0,%%v7\n\t" - "veslg %%v1,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vrepg %%v1,%%v0,1\n\t" - "aebr %%f0,%%f1\n\t" - "ste %%f0,0(%[y])" - : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", 
"v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "veslg %%v1,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vrepg %%v1,%%v0,1\n\t" + "aebr %%f0,%%f1\n\t" + "ste %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { @@ -366,70 +374,70 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { __asm__("vlrepf %%v0,%[da]\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl %%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" - "vfmasb 
%%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) dest) - : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), - [src] "a"(src),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index df3c9cb4d7..7015aaa1da 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -31,53 +31,53 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) { FLOAT max; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - 
"vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxsb %%v16,%%v16,%%v24,0\n\t" - "vfmaxsb %%v17,%%v17,%%v25,0\n\t" - "vfmaxsb %%v18,%%v18,%%v26,0\n\t" - "vfmaxsb %%v19,%%v19,%%v27,0\n\t" - "vfmaxsb %%v20,%%v20,%%v28,0\n\t" - "vfmaxsb %%v21,%%v21,%%v29,0\n\t" - "vfmaxsb %%v22,%%v22,%%v30,0\n\t" - "vfmaxsb %%v23,%%v23,%%v31,0\n\t" - "vfmaxsb %%v16,%%v16,%%v20,0\n\t" - "vfmaxsb %%v17,%%v17,%%v21,0\n\t" - "vfmaxsb %%v18,%%v18,%%v22,0\n\t" - "vfmaxsb %%v19,%%v19,%%v23,0\n\t" - "vfmaxsb %%v16,%%v16,%%v18,0\n\t" - "vfmaxsb %%v17,%%v17,%%v19,0\n\t" - "vfmaxsb %%v16,%%v16,%%v17,0\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,0\n\t" - "ler %[max],%%f0" - : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v17,%%v17,%%v25,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v19,%%v19,%%v27,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v21,%%v21,%%v29,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v23,%%v23,%%v31,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v17,%%v17,%%v21,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v19,%%v19,%%v23,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v17,%%v17,%%v19,0\n\t" + "vfmaxsb %%v16,%%v16,%%v17,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return max; } diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index 2e9c793c46..b6875c5c69 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -31,53 +31,53 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { FLOAT min; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl 
%%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfminsb %%v16,%%v16,%%v24,0\n\t" - "vfminsb %%v17,%%v17,%%v25,0\n\t" - "vfminsb %%v18,%%v18,%%v26,0\n\t" - "vfminsb %%v19,%%v19,%%v27,0\n\t" - "vfminsb %%v20,%%v20,%%v28,0\n\t" - "vfminsb %%v21,%%v21,%%v29,0\n\t" - "vfminsb %%v22,%%v22,%%v30,0\n\t" - "vfminsb %%v23,%%v23,%%v31,0\n\t" - "vfminsb %%v16,%%v16,%%v20,0\n\t" - "vfminsb %%v17,%%v17,%%v21,0\n\t" - "vfminsb %%v18,%%v18,%%v22,0\n\t" - "vfminsb %%v19,%%v19,%%v23,0\n\t" - "vfminsb %%v16,%%v16,%%v18,0\n\t" - "vfminsb %%v17,%%v17,%%v19,0\n\t" - "vfminsb %%v16,%%v16,%%v17,0\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,0\n\t" - "ler %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v17,%%v17,%%v25,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v19,%%v19,%%v27,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v21,%%v21,%%v29,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v23,%%v23,%%v31,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v17,%%v17,%%v21,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v19,%%v19,%%v23,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v17,%%v17,%%v19,0\n\t" + "vfminsb %%v16,%%v16,%%v17,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return min; } diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c index 5b21a19dcf..4f471d8668 100644 --- a/kernel/zarch/srot.c +++ b/kernel/zarch/srot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
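The kernel below applies the standard Givens plane rotation to both vectors in place, 64 elements per loop iteration, with c and s broadcast once via vlrepf. Its per-element effect, as a scalar sketch (srot_ref is an illustrative name; n is assumed a positive multiple of 64):

/* Per-element effect of srot_kernel_64. */
static void srot_ref(long n, float *x, float *y, float c, float s) {
    long i;
    for (i = 0; i < n; i++) {
        float xi = c * x[i] + s * y[i];  /* vfmsb x*c, then vfmasb adds y*s */
        float yi = c * y[i] - s * x[i];  /* vfmsb x*s, then vfmssb forms y*c - x*s */
        x[i] = xi;
        y[i] = yi;
    }
}
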
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepf %%v0,%[c]\n\t" - "vlrepf %%v1,%[s]\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb 
%%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb 
%%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", 
"v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index 07e6845c6d..9b9930dc87 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -29,61 +29,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { __asm__("vlrepf %%v0,%[da]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" - "vfmsb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" - "vfmsb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" - "vfmsb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" - "vfmsb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" - "vfmsb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" - "vfmsb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" - "vfmsb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vfmsb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x),[da] "m"(da) - : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmsb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmsb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmsb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmsb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmsb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmsb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmsb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmsb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : [x] "a"(x),[da] "Q"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) x),[n] 
"+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c index dc71131436..0c62f189d7 100644 --- a/kernel/zarch/sswap.c +++ b/kernel/zarch/sswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 
240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 531e47a0b3..aa04ab91fe 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -34,89 +34,89 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vleg %%v24,128(%%r1,%[x]),0\n\t" - "vleg %%v25,136(%%r1,%[x]),0\n\t" - "vleg %%v24,144(%%r1,%[x]),1\n\t" - "vleg %%v25,152(%%r1,%[x]),1\n\t" - "vleg %%v26,160(%%r1,%[x]),0\n\t" - "vleg %%v27,168(%%r1,%[x]),0\n\t" - "vleg %%v26,176(%%r1,%[x]),1\n\t" - "vleg %%v27,184(%%r1,%[x]),1\n\t" - "vleg %%v28,192(%%r1,%[x]),0\n\t" - "vleg %%v29,200(%%r1,%[x]),0\n\t" - "vleg %%v28,208(%%r1,%[x]),1\n\t" - "vleg %%v29,216(%%r1,%[x]),1\n\t" - "vleg 
%%v30,224(%%r1,%[x]),0\n\t" - "vleg %%v31,232(%%r1,%[x]),0\n\t" - "vleg %%v30,240(%%r1,%[x]),1\n\t" - "vleg %%v31,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16,%%v16\n\t" - "vflpdb %%v17,%%v17\n\t" - "vflpdb %%v18,%%v18\n\t" - "vflpdb %%v19,%%v19\n\t" - "vflpdb %%v20,%%v20\n\t" - "vflpdb %%v21,%%v21\n\t" - "vflpdb %%v22,%%v22\n\t" - "vflpdb %%v23,%%v23\n\t" - "vflpdb %%v24,%%v24\n\t" - "vflpdb %%v25,%%v25\n\t" - "vflpdb %%v26,%%v26\n\t" - "vflpdb %%v27,%%v27\n\t" - "vflpdb %%v28,%%v28\n\t" - "vflpdb %%v29,%%v29\n\t" - "vflpdb %%v30,%%v30\n\t" - "vflpdb %%v31,%%v31\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v18,%%v18,%%v19\n\t" - "vfadb %%v20,%%v20,%%v21\n\t" - "vfadb %%v22,%%v22,%%v23\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v26,%%v26,%%v27\n\t" - "vfadb %%v28,%%v28,%%v29\n\t" - "vfadb %%v30,%%v30,%%v31\n\t" - "vfmaxdb %%v16,%%v16,%%v24,0\n\t" - "vfmaxdb %%v18,%%v18,%%v26,0\n\t" - "vfmaxdb %%v20,%%v20,%%v28,0\n\t" - "vfmaxdb %%v22,%%v22,%%v30,0\n\t" - "vfmaxdb %%v16,%%v16,%%v20,0\n\t" - "vfmaxdb %%v18,%%v18,%%v22,0\n\t" - "vfmaxdb %%v16,%%v16,%%v18,0\n\t" - "vfmaxdb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,0\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmaxdb 
%%v16,%%v16,%%v24,0\n\t" + "vfmaxdb %%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c index cac2da938f..37278d6dbb 100644 --- a/kernel/zarch/zamax_z13.c +++ b/kernel/zarch/zamax_z13.c @@ -34,98 +34,98 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v24,%%v25\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v26,%%v0\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v24,%%v25\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v26,%%v0\n\t" - 
"vsel %%v0,%%v26,%%v0,%%v27\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); return amax; } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 940d81dd20..0b54028532 100644 --- a/kernel/zarch/zamin.c +++ 
b/kernel/zarch/zamin.c @@ -34,89 +34,89 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vleg %%v24,128(%%r1,%[x]),0\n\t" - "vleg %%v25,136(%%r1,%[x]),0\n\t" - "vleg %%v24,144(%%r1,%[x]),1\n\t" - "vleg %%v25,152(%%r1,%[x]),1\n\t" - "vleg %%v26,160(%%r1,%[x]),0\n\t" - "vleg %%v27,168(%%r1,%[x]),0\n\t" - "vleg %%v26,176(%%r1,%[x]),1\n\t" - "vleg %%v27,184(%%r1,%[x]),1\n\t" - "vleg %%v28,192(%%r1,%[x]),0\n\t" - "vleg %%v29,200(%%r1,%[x]),0\n\t" - "vleg %%v28,208(%%r1,%[x]),1\n\t" - "vleg %%v29,216(%%r1,%[x]),1\n\t" - "vleg %%v30,224(%%r1,%[x]),0\n\t" - "vleg %%v31,232(%%r1,%[x]),0\n\t" - "vleg %%v30,240(%%r1,%[x]),1\n\t" - "vleg %%v31,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16,%%v16\n\t" - "vflpdb %%v17,%%v17\n\t" - "vflpdb %%v18,%%v18\n\t" - "vflpdb %%v19,%%v19\n\t" - "vflpdb %%v20,%%v20\n\t" - "vflpdb %%v21,%%v21\n\t" - "vflpdb %%v22,%%v22\n\t" - "vflpdb %%v23,%%v23\n\t" - "vflpdb %%v24,%%v24\n\t" - "vflpdb %%v25,%%v25\n\t" - "vflpdb %%v26,%%v26\n\t" - "vflpdb %%v27,%%v27\n\t" - "vflpdb %%v28,%%v28\n\t" - "vflpdb %%v29,%%v29\n\t" - "vflpdb %%v30,%%v30\n\t" - "vflpdb %%v31,%%v31\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v18,%%v18,%%v19\n\t" - "vfadb %%v20,%%v20,%%v21\n\t" - "vfadb %%v22,%%v22,%%v23\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v26,%%v26,%%v27\n\t" - "vfadb %%v28,%%v28,%%v29\n\t" - "vfadb %%v30,%%v30,%%v31\n\t" - "vfmindb %%v16,%%v16,%%v24,0\n\t" - "vfmindb %%v18,%%v18,%%v26,0\n\t" - "vfmindb %%v20,%%v20,%%v28,0\n\t" - "vfmindb %%v22,%%v22,%%v30,0\n\t" - "vfmindb %%v16,%%v16,%%v20,0\n\t" - "vfmindb %%v18,%%v18,%%v22,0\n\t" - "vfmindb %%v16,%%v16,%%v18,0\n\t" - "vfmindb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,0\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg 
%%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c index 7417e0b742..e37bb2236f 100644 --- a/kernel/zarch/zamin_z13.c +++ b/kernel/zarch/zamin_z13.c @@ -34,98 +34,98 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - 
"vfchdb %%v25,%%v19,%%v18\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v25,%%v24\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v0,%%v26\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v25,%%v24\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v0,%%v26\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg 
%%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); return amin; } diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 43ae8ff8b7..aeef8d77e6 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -34,81 +34,81 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - 
"vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 31549849d8..9363ec32df 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -30,77 +30,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__( #if !defined(CONJ) - "vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" + "vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" #else - "vleg %%v0,0(%[alpha]),1\n\t" - "vflcdb %%v0,%%v0\n\t" - "vleg %%v0,0(%[alpha]),0\n\t" - "vlrepg %%v1,8(%[alpha])\n\t" + "vleg %%v0,0(%[alpha]),1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,0(%[alpha]),0\n\t" + "vlrepg %%v1,8(%[alpha])\n\t" #endif - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" - "vpdi %%v24,%%v8,%%v8,4\n\t" - "vpdi %%v25,%%v9,%%v9,4\n\t" - "vpdi %%v26,%%v10,%%v10,4\n\t" - "vpdi %%v27,%%v11,%%v11,4\n\t" - "vpdi %%v28,%%v16,%%v16,4\n\t" - "vpdi %%v29,%%v17,%%v17,4\n\t" - "vpdi %%v30,%%v18,%%v18,4\n\t" - "vpdi %%v31,%%v19,%%v19,4\n\t" - "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" - "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" - "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" - "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" - "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" - "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" - "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" - "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" - "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" - "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" - "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" - "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" - "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vpdi %%v24,%%v8,%%v8,4\n\t" + "vpdi %%v25,%%v9,%%v9,4\n\t" + "vpdi %%v26,%%v10,%%v10,4\n\t" + "vpdi %%v27,%%v11,%%v11,4\n\t" + "vpdi %%v28,%%v16,%%v16,4\n\t" + "vpdi %%v29,%%v17,%%v17,4\n\t" + "vpdi 
%%v30,%%v18,%%v18,4\n\t" + "vpdi %%v31,%%v19,%%v19,4\n\t" + "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 50ff186461..5a46aec1c9 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -29,16 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],4\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + [n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 7a67ef734b..ac6e69c23f 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -29,76 +29,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
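/* For reference (not part of the patch): zdot_kernel_8 below accumulates the
 * four partial sums of a double-complex dot product into d[0..3]; the caller
 * then combines them according to CONJ. A minimal scalar sketch (zdot_ref is
 * a hypothetical helper):
 *
 *   static void zdot_ref(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
 *     d[0] = d[1] = d[2] = d[3] = 0.0;
 *     for (BLASLONG i = 0; i < n; i++) {
 *       d[0] += x[2 * i]     * y[2 * i];      // re(x) * re(y)
 *       d[1] += x[2 * i + 1] * y[2 * i + 1];  // im(x) * im(y)
 *       d[2] += x[2 * i]     * y[2 * i + 1];  // re(x) * im(y)
 *       d[3] += x[2 * i + 1] * y[2 * i];      // im(x) * re(y)
 *     }
 *   }
 */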
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v25,%%v25,%%v27\n\t" - "vfadb %%v25,%%v25,%%v29\n\t" - "vfadb %%v25,%%v25,%%v31\n\t" - "vsteg %%v24,0(%[d]),0\n\t" - "vsteg %%v24,8(%[d]),1\n\t" - "vsteg %%v25,16(%[d]),1\n\t" - "vsteg %%v25,24(%[d]),0" - : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 
96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v25,%%v25,%%v27\n\t" + "vfadb %%v25,%%v25,%%v29\n\t" + "vfadb %%v25,%%v25,%%v31\n\t" + "vsteg %%v24,0(%[d]),0\n\t" + "vsteg %%v24,8(%[d]),1\n\t" + "vsteg %%v25,16(%[d]),1\n\t" + "vsteg %%v25,24(%[d]),0" + : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 7f21985ecf..5ca8da3c17 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -30,235 +30,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" - "vl %%v18,32(%[x])\n\t" - "vl %%v19,48(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v20,8(%[x]),0\n\t" - "wflcdb %%v20,%%v20\n\t" - "vleg %%v20,0(%[x]),1\n\t" - "vleg %%v21,24(%[x]),0\n\t" - "wflcdb %%v21,%%v21\n\t" - "vleg %%v21,16(%[x]),1\n\t" - "vleg %%v22,40(%[x]),0\n\t" - "wflcdb %%v22,%%v22\n\t" - "vleg %%v22,32(%[x]),1\n\t" - "vleg %%v23,56(%[x]),0\n\t" - "wflcdb %%v23,%%v23\n\t" - "vleg %%v23,48(%[x]),1\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "wflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "wflcdb %%v22,%%v22\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vleg %%v23,56(%[x]),0\n\t" + "wflcdb %%v23,%%v23\n\t" + "vleg %%v23,48(%[x]),1\n\t" #else - "vleg %%v20,0(%[x]),1\n\t" - "vflcdb %%v20,%%v20\n\t" - "vleg %%v20,8(%[x]),0\n\t" - "vleg %%v21,16(%[x]),1\n\t" - "vflcdb %%v21,%%v21\n\t" - "vleg %%v21,24(%[x]),0\n\t" - "vleg %%v22,32(%[x]),1\n\t" - "vflcdb %%v22,%%v22\n\t" - "vleg %%v22,40(%[x]),0\n\t" - "vleg %%v23,48(%[x]),1\n\t" - "vflcdb %%v23,%%v23\n\t" - "vleg %%v23,56(%[x]),0\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vflcdb %%v21,%%v21\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vflcdb %%v22,%%v22\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "vleg %%v23,48(%[x]),1\n\t" + "vflcdb %%v23,%%v23\n\t" + "vleg %%v23,56(%[x]),0\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 
1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v24,0(%%r1,%[ap0])\n\t" - "vlrepg %%v25,8(%%r1,%[ap0])\n\t" - "vlrepg %%v26,0(%%r1,%[ap1])\n\t" - "vlrepg %%v27,8(%%r1,%[ap1])\n\t" - "vlrepg %%v28,16(%%r1,%[ap0])\n\t" - "vlrepg %%v29,24(%%r1,%[ap0])\n\t" - "vlrepg %%v30,16(%%r1,%[ap1])\n\t" - "vlrepg %%v31,24(%%r1,%[ap1])\n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" - "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" - "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" - "vlrepg %%v24,0(%%r1,%[ap2])\n\t" - "vlrepg %%v25,8(%%r1,%[ap2])\n\t" - "vlrepg %%v26,0(%%r1,%[ap3])\n\t" - "vlrepg %%v27,8(%%r1,%[ap3])\n\t" - "vlrepg %%v28,16(%%r1,%[ap2])\n\t" - "vlrepg %%v29,24(%%r1,%[ap2])\n\t" - "vlrepg %%v30,16(%%r1,%[ap3])\n\t" - "vlrepg %%v31,24(%%r1,%[ap3])\n\t" - "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" - "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" - "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" - "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" - "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" - "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" - "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" - "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap0])\n\t" + "vlrepg %%v29,24(%%r1,%[ap0])\n\t" + "vlrepg %%v30,16(%%r1,%[ap1])\n\t" + "vlrepg %%v31,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" + "vlrepg %%v24,0(%%r1,%[ap2])\n\t" + "vlrepg %%v25,8(%%r1,%[ap2])\n\t" + "vlrepg %%v26,0(%%r1,%[ap3])\n\t" + "vlrepg %%v27,8(%%r1,%[ap3])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 
2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v18,8(%[x]),0\n\t" - "wflcdb %%v18,%%v18\n\t" - "vleg %%v18,0(%[x]),1\n\t" - "vleg %%v19,24(%[x]),0\n\t" - "wflcdb %%v19,%%v19\n\t" - "vleg %%v19,16(%[x]),1\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "wflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vleg %%v19,24(%[x]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,16(%[x]),1\n\t" #else - "vleg %%v18,0(%[x]),1\n\t" - "vflcdb %%v18,%%v18\n\t" - "vleg %%v18,8(%[x]),0\n\t" - "vleg %%v19,16(%[x]),1\n\t" - "vflcdb %%v19,%%v19\n\t" - "vleg %%v19,24(%[x]),0\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "vleg %%v19,16(%[x]),1\n\t" + "vflcdb %%v19,%%v19\n\t" + "vleg %%v19,24(%[x]),0\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v20,0(%%r1,%[ap0])\n\t" - "vlrepg %%v21,8(%%r1,%[ap0])\n\t" - "vlrepg %%v22,0(%%r1,%[ap1])\n\t" - "vlrepg %%v23,8(%%r1,%[ap1])\n\t" - "vlrepg %%v24,16(%%r1,%[ap0])\n\t" - "vlrepg %%v25,24(%%r1,%[ap0])\n\t" - "vlrepg %%v26,16(%%r1,%[ap1])\n\t" - "vlrepg %%v27,24(%%r1,%[ap1])\n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" - "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" - "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg 
%[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27"); } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v17,8(%[x]),0\n\t" - "wflcdb %%v17,%%v17\n\t" - "vleg %%v17,0(%[x]),1\n\t" + "vleg %%v17,8(%[x]),0\n\t" + "wflcdb %%v17,%%v17\n\t" + "vleg %%v17,0(%[x]),1\n\t" #else - "vleg %%v17,0(%[x]),1\n\t" - "vflcdb %%v17,%%v17\n\t" - "vleg %%v17,8(%[x]),0\n\t" + "vleg %%v17,0(%[x]),1\n\t" + "vflcdb %%v17,%%v17\n\t" + "vleg %%v17,8(%[x]),0\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v18,0(%%r1,%[ap])\n\t" - "vlrepg %%v19,8(%%r1,%[ap])\n\t" - "vlrepg %%v20,16(%%r1,%[ap])\n\t" - "vlrepg %%v21,24(%%r1,%[ap])\n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vlrepg %%v20,16(%%r1,%[ap])\n\t" + "vlrepg %%v21,24(%%r1,%[ap])\n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) { __asm__( #if !defined(XCONJ) - "vlrepg %%v0,%[alpha_r]\n\t" - "vleg %%v1,%[alpha_i],0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,%[alpha_i],1\n\t" + "vlrepg %%v0,%[alpha_r]\n\t" + "vleg %%v1,%[alpha_i],0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,%[alpha_i],1\n\t" #else - "vleg %%v0,%[alpha_r],1\n\t" - "vflcdb %%v0,%%v0\n\t" - "vleg %%v0,%[alpha_r],0\n\t" - "vlrepg %%v1,%[alpha_i]\n\t" + "vleg %%v0,%[alpha_r],1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,%[alpha_r],0\n\t" + "vlrepg %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],2\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,0(%%r1,%[dest])\n\t" - "vl %%v21,16(%%r1,%[dest])\n\t" - "vl %%v22,32(%%r1,%[dest])\n\t" - "vl %%v23,48(%%r1,%[dest])\n\t" - "vpdi 
%%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" - "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest])\n\t" - "vst %%v29,16(%%r1,%[dest])\n\t" - "vst %%v30,32(%%r1,%[dest])\n\t" - "vst %%v31,48(%%r1,%[dest])\n\t" - "agfi %%r1,64\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), - [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl %%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 7b3e6c1fc6..031c31e29b 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -31,266 +31,274 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
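/*
 * Editor's note: the transposed-GEMV kernels below accumulate four complex
 * dot products (one per column in ap[]) in two FMA chains each -- one fed by
 * the loaded vector, one by its vpdi-swapped copy -- then merge them into y
 * scaled by alpha.  The plain-C sketch below shows the arithmetic for the
 * non-conjugated (!CONJ && !XCONJ) path only; zgemv_t_4x4_ref is a
 * hypothetical reference helper added for illustration, not part of the
 * original patch.
 */
static void zgemv_t_4x4_ref(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
                            FLOAT *alpha) {
  FLOAT tr[4] = { 0.0 }, ti[4] = { 0.0 };
  BLASLONG i;
  int j;

  for (i = 0; i < n; i++)
    for (j = 0; j < 4; j++) {
      FLOAT ar = ap[j][2 * i], ai = ap[j][2 * i + 1];
      FLOAT xr = x[2 * i], xi = x[2 * i + 1];
      tr[j] += ar * xr - ai * xi;   /* Re(a * x) */
      ti[j] += ar * xi + ai * xr;   /* Im(a * x) */
    }
  for (j = 0; j < 4; j++) {
    /* y[j] += alpha * t[j], matching the !XCONJ alpha block in the asm */
    y[2 * j]     += alpha[0] * tr[j] - alpha[1] * ti[j];
    y[2 * j + 1] += alpha[0] * ti[j] + alpha[1] * tr[j];
  }
}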
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "vzero %%v20\n\t" - "vzero %%v21\n\t" - "vzero %%v22\n\t" - "vzero %%v23\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vzero %%v17\n\t" + "vzero %%v18\n\t" + "vzero %%v19\n\t" + "vzero %%v20\n\t" + "vzero %%v21\n\t" + "vzero %%v22\n\t" + "vzero %%v23\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v24,0(%%r1,%[ap0])\n\t" - "vlrepg %%v25,8(%%r1,%[ap0])\n\t" - "vlrepg %%v26,0(%%r1,%[ap1])\n\t" - "vlrepg %%v27,8(%%r1,%[ap1])\n\t" - "vlrepg %%v28,0(%%r1,%[ap2])\n\t" - "vlrepg %%v29,8(%%r1,%[ap2])\n\t" - "vlrepg %%v30,0(%%r1,%[ap3])\n\t" - "vlrepg %%v31,8(%%r1,%[ap3])\n\t" - "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" - "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" - "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" - "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" - "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" - "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" - "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,0(%%r1,%[ap2])\n\t" + "vlrepg %%v29,8(%%r1,%[ap2])\n\t" + "vlrepg %%v30,0(%%r1,%[ap3])\n\t" + "vlrepg %%v31,8(%%r1,%[ap3])\n\t" + "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" + "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" + "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" + "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" + "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" + "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" + "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v24,16(%%r1,%[ap0])\n\t" - "vlrepg %%v25,24(%%r1,%[ap0])\n\t" - "vlrepg %%v26,16(%%r1,%[ap1])\n\t" - "vlrepg %%v27,24(%%r1,%[ap1])\n\t" - "vlrepg %%v28,16(%%r1,%[ap2])\n\t" - "vlrepg %%v29,24(%%r1,%[ap2])\n\t" - "vlrepg %%v30,16(%%r1,%[ap3])\n\t" - "vlrepg %%v31,24(%%r1,%[ap3])\n\t" - "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" - "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" - 
"vfmadb %%v17,%%v26,%%v0,%%v17\n\t" - "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" - "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" - "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" - "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v20\n\t" - "vfadb %%v17,%%v17,%%v21\n\t" - "vfadb %%v18,%%v18,%%v22\n\t" - "vfadb %%v19,%%v19,%%v23\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" + "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" + "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" + "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" + "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" + "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" + "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v20\n\t" + "vfadb %%v17,%%v17,%%v21\n\t" + "vfadb %%v18,%%v18,%%v22\n\t" + "vfadb %%v19,%%v19,%%v23\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" #if !defined(XCONJ) - "vlrepg %%v24,0(%[alpha])\n\t" - "vleg %%v25,8(%[alpha]),0\n\t" - "wflcdb %%v25,%%v25\n\t" - "vleg %%v25,8(%[alpha]),1\n\t" + "vlrepg %%v24,0(%[alpha])\n\t" + "vleg %%v25,8(%[alpha]),0\n\t" + "wflcdb %%v25,%%v25\n\t" + "vleg %%v25,8(%[alpha]),1\n\t" #else - "vleg %%v24,0(%[alpha]),1\n\t" - "vflcdb %%v24,%%v24\n\t" - "vleg %%v24,0(%[alpha]),0\n\t" - "vlrepg %%v25,8(%[alpha])\n\t" + "vleg %%v24,0(%[alpha]),1\n\t" + "vflcdb %%v24,%%v24\n\t" + "vleg %%v24,0(%[alpha]),0\n\t" + "vlrepg %%v25,8(%[alpha])\n\t" #endif - "vl %%v26,0(%[y])\n\t" - "vl %%v27,16(%[y])\n\t" - "vl %%v28,32(%[y])\n\t" - "vl %%v29,48(%[y])\n\t" - "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" - "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" - "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" - "vfmadb %%v27,%%v21,%%v25,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v24,%%v28\n\t" - "vfmadb %%v28,%%v22,%%v25,%%v28\n\t" - "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" - "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y])\n\t" - "vst %%v27,16(%[y])\n\t" - "vst %%v28,32(%[y])\n\t" - "vst %%v29,48(%[y])" - : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vl %%v26,0(%[y])\n\t" + "vl %%v27,16(%[y])\n\t" + "vl %%v28,32(%[y])\n\t" + "vl %%v29,48(%[y])\n\t" + "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" + "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" + "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" + "vfmadb %%v27,%%v21,%%v25,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v24,%%v28\n\t" + "vfmadb %%v28,%%v22,%%v25,%%v28\n\t" + "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" + "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" + "vst %%v26,0(%[y])\n\t" + "vst %%v27,16(%[y])\n\t" + "vst %%v28,32(%[y])\n\t" + "vst %%v29,48(%[y])" + : "+m"(*(struct { 
FLOAT x[8]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vzero %%v17\n\t" + "vzero %%v18\n\t" + "vzero %%v19\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v20,0(%%r1,%[ap0])\n\t" - "vlrepg %%v21,8(%%r1,%[ap0])\n\t" - "vlrepg %%v22,0(%%r1,%[ap1])\n\t" - "vlrepg %%v23,8(%%r1,%[ap1])\n\t" - "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" - "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" - "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" - "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" + "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" + "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" + "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v20,16(%%r1,%[ap0])\n\t" - "vlrepg %%v21,24(%%r1,%[ap0])\n\t" - "vlrepg %%v22,16(%%r1,%[ap1])\n\t" - "vlrepg %%v23,24(%%r1,%[ap1])\n\t" - "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" - "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" - "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" - "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v18\n\t" - "vfadb %%v17,%%v17,%%v19\n\t" - "vpdi %%v18,%%v16,%%v16,4\n\t" - "vpdi %%v19,%%v17,%%v17,4\n\t" + "vlrepg %%v20,16(%%r1,%[ap0])\n\t" + "vlrepg %%v21,24(%%r1,%[ap0])\n\t" + "vlrepg %%v22,16(%%r1,%[ap1])\n\t" + "vlrepg %%v23,24(%%r1,%[ap1])\n\t" + "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" + "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" + "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" + 
"vfmadb %%v19,%%v23,%%v1,%%v19\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v18\n\t" + "vfadb %%v17,%%v17,%%v19\n\t" + "vpdi %%v18,%%v16,%%v16,4\n\t" + "vpdi %%v19,%%v17,%%v17,4\n\t" #if !defined(XCONJ) - "vlrepg %%v20,0(%[alpha])\n\t" - "vleg %%v21,8(%[alpha]),0\n\t" - "wflcdb %%v21,%%v21\n\t" - "vleg %%v21,8(%[alpha]),1\n\t" + "vlrepg %%v20,0(%[alpha])\n\t" + "vleg %%v21,8(%[alpha]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,8(%[alpha]),1\n\t" #else - "vleg %%v20,0(%[alpha]),1\n\t" - "vflcdb %%v20,%%v20\n\t" - "vleg %%v20,0(%[alpha]),0\n\t" - "vlrepg %%v21,8(%[alpha])\n\t" + "vleg %%v20,0(%[alpha]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[alpha]),0\n\t" + "vlrepg %%v21,8(%[alpha])\n\t" #endif - "vl %%v22,0(%[y])\n\t" - "vl %%v23,16(%[y])\n\t" - "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" - "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" - "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" - "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y])\n\t" - "vst %%v23,16(%[y])\n\t" - : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23"); + "vl %%v22,0(%[y])\n\t" + "vl %%v23,16(%[y])\n\t" + "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" + "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" + "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" + "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" + "vst %%v22,0(%[y])\n\t" + "vst %%v23,16(%[y])\n\t" + : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vzero %%v17\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v18,0(%%r1,%[ap])\n\t" - "vlrepg %%v19,8(%%r1,%[ap])\n\t" - "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" - "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" + "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg 
%%v1,16(%%r1,%[x]),1\n\t" #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v18,16(%%r1,%[ap])\n\t" - "vlrepg %%v19,24(%%r1,%[ap])\n\t" - "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" - "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vpdi %%v17,%%v16,%%v16,4\n\t" + "vlrepg %%v18,16(%%r1,%[ap])\n\t" + "vlrepg %%v19,24(%%r1,%[ap])\n\t" + "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" + "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vpdi %%v17,%%v16,%%v16,4\n\t" #if !defined(XCONJ) - "vlrepg %%v18,0(%[alpha])\n\t" - "vleg %%v19,8(%[alpha]),0\n\t" - "wflcdb %%v19,%%v19\n\t" - "vleg %%v19,8(%[alpha]),1\n\t" + "vlrepg %%v18,0(%[alpha])\n\t" + "vleg %%v19,8(%[alpha]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,8(%[alpha]),1\n\t" #else - "vleg %%v18,0(%[alpha]),1\n\t" - "vflcdb %%v18,%%v18\n\t" - "vleg %%v18,0(%[alpha]),0\n\t" - "vlrepg %%v19,8(%[alpha])\n\t" + "vleg %%v18,0(%[alpha]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[alpha]),0\n\t" + "vlrepg %%v19,8(%[alpha])\n\t" #endif - "vl %%v0,0(%[y])\n\t" - "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" - "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y])\n\t" - : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); + "vl %%v0,0(%[y])\n\t" + "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" + "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" + "vst %%v0,0(%[y])\n\t" + : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index aa7f166052..6284d5a474 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
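/*
 * Editor's note: zrot applies a plane rotation to interleaved complex data.
 * Because c and s are real, the identical update applies independently to
 * every real and imaginary component, which is why the kernel below can
 * treat the buffers as flat arrays of 2*n doubles.  zrot_ref is a
 * hypothetical plain-C reference mirroring the asm comments "yn=x*s" and
 * "yn=y*c-yn"; it is not part of the original patch.
 */
static void zrot_ref(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
  BLASLONG i;

  for (i = 0; i < 2 * n; i++) {   /* 2*n scalars: real/imag interleaved */
    FLOAT xt = *c * x[i] + *s * y[i];
    FLOAT yt = *c * y[i] - *s * x[i];
    x[i] = xt;
    y[i] = yt;
  }
}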
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepg %%v0,%[c]\n\t" - "vlrepg %%v1,%[s]\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb 
%%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + 
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", 
"v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index fbcc0c5b9a..e497a6d7b9 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -29,167 +29,170 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vpdi %%v28,%%v20,%%v20,4\n\t" - "vpdi %%v29,%%v21,%%v21,4\n\t" - "vpdi %%v30,%%v22,%%v22,4\n\t" - "vpdi %%v31,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vpdi %%v28,%%v20,%%v20,4\n\t" + "vpdi %%v29,%%v21,%%v21,4\n\t" + "vpdi %%v30,%%v22,%%v22,4\n\t" + "vpdi %%v31,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmadb 
%%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vleg %%v0,8(%[alpha]),0\n\t" - "wflcdb %%v0,%%v0\n\t" - "vleg %%v0,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v16,%%v16,%%v16,4\n\t" - "vpdi %%v17,%%v17,%%v17,4\n\t" - "vpdi %%v18,%%v18,%%v18,4\n\t" - "vpdi %%v19,%%v19,%%v19,4\n\t" - "vpdi %%v20,%%v20,%%v20,4\n\t" - "vpdi %%v21,%%v21,%%v21,4\n\t" - "vpdi %%v22,%%v22,%%v22,4\n\t" - "vpdi %%v23,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "wflcdb %%v0,%%v0\n\t" + "vleg %%v0,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v16,%%v16,%%v16,4\n\t" + "vpdi %%v17,%%v17,%%v17,4\n\t" + "vpdi %%v18,%%v18,%%v18,4\n\t" + "vpdi %%v19,%%v19,%%v19,4\n\t" + "vpdi %%v20,%%v20,%%v20,4\n\t" + "vpdi %%v21,%%v21,%%v21,4\n\t" + "vpdi %%v22,%%v22,%%v22,4\n\t" + "vpdi %%v23,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void 
zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0f38103be7..bc466866cb 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN 
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 
16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, From f5836741092ca3f9358c2a24c6056bf098b3f748 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 12 Feb 2019 13:12:28 +0200 Subject: [PATCH 098/189] [ZARCH] Fix cgemv_t_4 --- kernel/zarch/cgemv_t_4.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 91ea1c10c2..e10edfab02 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -120,10 +120,10 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v20\n\t" - "vfadb %%v17,%%v17,%%v21\n\t" - "vfadb %%v18,%%v18,%%v22\n\t" - "vfadb %%v19,%%v19,%%v23\n\t" + "vfasb %%v16,%%v16,%%v20\n\t" + "vfasb %%v17,%%v17,%%v21\n\t" + "vfasb %%v18,%%v18,%%v22\n\t" + "vfasb %%v19,%%v19,%%v23\n\t" "vrepg %%v20,%%v16,1\n\t" "vrepg %%v21,%%v17,1\n\t" "vrepg %%v22,%%v18,1\n\t" @@ -244,8 +244,8 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v19,%%v23,%%v1,%%v19\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v18\n\t" - "vfadb %%v17,%%v17,%%v19\n\t" + "vfasb %%v16,%%v16,%%v18\n\t" + "vfasb %%v17,%%v17,%%v19\n\t" "vrepg %%v18,%%v16,1\n\t" "vrepg %%v19,%%v17,1\n\t" "vfasb %%v16,%%v16,%%v18\n\t" @@ -342,7 +342,7 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vfmasb %%v17,%%v19,%%v1,%%v17\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" "vrepg %%v17,%%v16,1\n\t" "vfasb %%v16,%%v16,%%v17\n\t" "verllg %%v17,%%v16,32\n\t" From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 15:33:48 +0100 Subject: [PATCH 099/189] Fix declaration of input arguments in the x86_64 s/dGEMV_T and s/dGEMV_N kernels 
Arguments 0 and 1 need to be tagged as both input and output --- kernel/x86_64/dgemv_n_4.c | 10 +++++----- kernel/x86_64/dgemv_t_4.c | 18 +++++++++--------- kernel/x86_64/sgemv_n_4.c | 14 +++++++------- kernel/x86_64/sgemv_t_4.c | 18 +++++++++--------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 6d2530e81e..6d33641e91 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "jnz 1b \n\t" : + "+r" (i), // 0 + "+r" (n) // 1 : - "r" (i), // 0 - "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index a7478e3a8b..ed672a7579 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movsd %%xmm11,8(%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movsd %%xmm10, (%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 4 diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 65305ac59f..63697970fe 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "3: \n\t" : + "+r" (i), // 0 + "+r" (n1) // 1 : - "r" (i), // 0 - "r" (n1), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 @@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) "jnz 1b \n\t" : + "+r" (i), // 0 + "+r" (n) // 1 : - "r" (i), // 0 - "r" (n), // 1 "r" (src), // 2 "r" (dest) // 3 : "cc", diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 065e5b3852..86ecaf516e 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movss %%xmm11,4(%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movss %%xmm10, (%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 
4 From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 15:51:43 +0100 Subject: [PATCH 100/189] Fix declaration of input arguments in inline assembly Argument 0 is modified as it doubles as a counter --- kernel/x86_64/dscal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index ef9a0a6ba0..d0d7801fd4 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ "jnz 1b \n\t" : + "+r" (n) // 0 : - "r" (n), // 0 "r" (x), // 1 "r" (x1), // 2 "r" (alpha), // 3 From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 16:00:18 +0100 Subject: [PATCH 101/189] Fix declaration of assembly arguments in SSYMV and DSYMV microkernels Arguments 0 and 1 are both input and output --- kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++--- kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++--- kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++--- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c index d7166fe4b4..ae287b6d8c 100644 --- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c @@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c index d83d20f8e6..4778f644a3 100644 --- a/kernel/x86_64/dsymv_U_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c @@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c index 1344c75f73..065182286a 100644 --- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c @@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c index 1ef6fbafdc..d84e703bd5 100644 --- a/kernel/x86_64/dsymv_U_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c @@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index 8c01ab8069..4a4f4d68de 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ 
b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c index a32e59b447..e6a09ccf88 100644 --- a/kernel/x86_64/ssymv_U_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c @@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c index b8e6ee7326..c56ff3b15d 100644 --- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c @@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c index e8650650cd..c4919a39a4 100644 --- a/kernel/x86_64/ssymv_U_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c @@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 16:14:02 +0100 Subject: [PATCH 102/189] Fix declaration of arguments in inline assembly Argument 0 is modified so should be input and output --- kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++-- kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++-- kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++-- kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++---- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c index d84470cc44..bfa07b6d02 100644 --- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c @@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index 866782ee6f..6241879d5c 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c index 38479f77af..a161dcd8b3 100644 --- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c +++ 
b/kernel/x86_64/dsymv_L_microk_nehalem-2.c @@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c index b4e6ab3692..b205b10193 100644 --- a/kernel/x86_64/dsymv_L_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c @@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c index 9002228f32..602c3edf2d 100644 --- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c @@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c index 69db008b66..fdfe4349a1 100644 --- a/kernel/x86_64/ssymv_L_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c @@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c index c0fe5d6401..6bb9c02f6f 100644 --- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c index 093ca8073c..0c78212e7d 100644 --- a/kernel/x86_64/ssymv_L_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c @@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 @@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 From bec54ae366ebce932b6bd6bdc89d4e585a0da798 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 13 Feb 2019 12:54:35 +0200 Subject: [PATCH 103/189] [ZARCH] Fix caxpy --- kernel/zarch/caxpy.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index e4b484ab7d..14a124ae25 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -65,6 +65,14 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vl %%v21,80(%%r1,%[y])\n\t" "vl %%v22,96(%%r1,%[y])\n\t" "vl %%v23,112(%%r1,%[y])\n\t" + "verllg %%v24,%%v8,32\n\t" + "verllg %%v25,%%v9,32\n\t" + "verllg %%v26,%%v10,32\n\t" + "verllg %%v27,%%v11,32\n\t" + "verllg %%v28,%%v16,32\n\t" + "verllg %%v29,%%v17,32\n\t" + "verllg %%v30,%%v18,32\n\t" + "verllg %%v31,%%v19,32\n\t" "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" 
"vfmasb %%v9,%%v9,%%v0,%%v13\n\t" "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" From 0a54c98b9d9a6ad8364297bbef0eea4b000a92f0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 13 Feb 2019 21:06:25 +0200 Subject: [PATCH 104/189] [ZARCH] Modify constraints --- kernel/zarch/cgemv_n_4.c | 2 +- kernel/zarch/zgemv_n_4.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index adba05d475..5c36bc3383 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -352,7 +352,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "brctg %[n],0b" : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), - [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 5ca8da3c17..13045a3591 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -263,7 +263,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "brctg %[n],0b" : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), - [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); From f9d67bb5e8e895fd5fe7e36e43febef7aa06ef35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Feb 2019 22:06:41 +0100 Subject: [PATCH 105/189] Fix out-of-bounds memory access in gemm_beta Fixes #2011 (as suggested by davemq) presuming typo by K.Goto --- kernel/power/gemm_beta.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 62d7761ec7..7acc05b4df 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -129,7 +129,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbst PRE, CO1 + dcbtst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 From 718efcec6fb6d45d5dd461ed47b26f49c2c4e77d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Feb 2019 22:08:37 +0100 Subject: [PATCH 106/189] Fix out-of-bounds memory access in gemm_beta Fixes #2011 (as suggested by davemq), assuming typo by K.Goto --- kernel/power/zgemm_beta.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 43b72ca157..1f4c29210d 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -134,7 +134,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbst PRE, CO1 + dcbtst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 From b55c586faca28863db16a2148b69aaa37aaa797e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Feb 2019 15:21:36 +0100 Subject: [PATCH 107/189] Fix missing clobber in x86/x86_64 blas_quickdivide inline assembly function (#2017) * Fix missing clobber in blas_quickdivide assembly --- common_x86.h | 2 +- common_x86_64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common_x86.h b/common_x86.h index 4f538c948e..3fdffe2a85 100644 --- a/common_x86.h +++ b/common_x86.h @@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int 
y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y)); return result; #endif diff --git a/common_x86_64.h b/common_x86_64.h index f27c1e9be8..718a81050b 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); return result; } From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 14 Feb 2019 16:19:41 +0000 Subject: [PATCH 108/189] dgemv_kernel_4x4(Haswell): add missing clobbers for xmm0,xmm1,xmm2,xmm3 This fixes a crash in dblat2 when OpenBLAS is compiled using -march=znver1 -ftree-vectorize -O2 See also: https://github.com/easybuilders/easybuild-easyconfigs/issues/7180 --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index 584a6c6b5b..da0fa2fff2 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Feb 2019 22:43:18 +0100 Subject: [PATCH 109/189] Save and restore input argument 8 (lda4) Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009) --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 2c90f8aa99..e89a16785d 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #define HAVE_KERNEL_4x8 1 static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); @@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss (%9), %%ymm6 \n\t" // alpha + "movq %8, %%xmm10 \n\t" //save lda + "testq $0x04, %1 \n\t" "jz 2f \n\t" @@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "4: \n\t" "vzeroupper \n\t" + "movq %%xmm10, %8 \n\t" //restore lda : "+r" (i), // 0 @@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", + "%xmm10", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO } - #define HAVE_KERNEL_4x4 1 static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz 2f \n\t" From adb419ed67cb6b3c416a7e6babdd28390cefe37d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Feb 2019 22:57:30 +0100 Subject: [PATCH 110/189] With the Intel compiler on Linux, prefer ifort for the final link step icc has known problems with mixed-language builds that ifort can handle just fine. Fixes #1956 --- exports/Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 3a5f77db3d..b1348bd4ac 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -141,6 +141,14 @@ else $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c endif + +ifeq ($(F_COMPILER), INTEL) + $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ + -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +else + ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ @@ -152,6 +160,7 @@ else -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
+endif endif rm -f linktest From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Feb 2019 10:10:04 +0100 Subject: [PATCH 111/189] Rename operands to put lda on the input/output constraint list --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------ 1 file changed, 61 insertions(+), 65 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index e89a16785d..93e1e26e8a 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%3), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha - "movq %8, %%xmm10 \n\t" //save lda - "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" - "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" - "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" - "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "testq $0x08, %1 \n\t" "jz 3f \n\t" - "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" - "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" - "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" - 
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" @@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y - - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - - "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" "addq $16, %0 \n\t" - "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" - "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" - "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" + "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" + "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" + "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" + "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $16, %8 \n\t" - "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y + "addq $16, %2 \n\t" + "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y "subq $16, %1 \n\t" - "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y "jnz 1b \n\t" "4: \n\t" "vzeroupper \n\t" - "movq %%xmm10, %8 \n\t" //restore lda : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", @@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", - "%xmm10", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); From 
c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Feb 2019 15:08:16 +0100 Subject: [PATCH 112/189] Fix wrong constraints in inline assembly for #2009 --- kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index fcab8e2c78..9ab78fc8ea 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " cmpq $0, %0 \n\t" " je 4f \n\t" - " vmovups (%2,%1,4), %%ymm0 \n\t" // read a - " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 - " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + " vmovups (%8,%1,4), %%ymm0 \n\t" // read a + " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 + " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 " addq $8, %1 \n\t" @@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .p2align 4 \n\t" "1: \n\t" - " vmovups (%2,%1,4), %%ymm4 \n\t" // read a + " vmovups (%8,%1,4), %%ymm4 \n\t" // read a " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 + " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" - " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 + " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 22f \n\t" - " vmovups (%2,%1,4), %%ymm0 \n\t" // read a + " vmovups (%8,%1,4), %%ymm0 \n\t" // read a " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" - " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 + " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" - " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" @@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" - " vmovups (%9), %%ymm0 \n\t" + " vmovups (%3), %%ymm0 \n\t" " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" @@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" - " vmovups 32(%9), %%ymm4 \n\t" + " vmovups 32(%3), %%ymm4 \n\t" " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" @@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "5: \n\t" // i = 0 - " addq $64, %9 \n\t" // b=b+8 + " addq $64, %3 \n\t" // b=b+8 " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb - " vmovups (%9), %%ymm0 
\n\t" - " vmovups %%ymm8 , (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups %%ymm8 , (%2) \n\t" // write a " vmovups %%ymm8 , (%4) \n\t" // write c " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" @@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb - " vmovups (%9), %%ymm0 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm9 , (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm9 , (%2) \n\t" // write a " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" @@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb - " vmovups (%9), %%ymm0 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm10, (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm10, (%2) \n\t" // write a " vmovups %%ymm10, (%4,%7,2) \n\t" // write c " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" @@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm11, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm11, (%2) \n\t" // write a " vmovups %%ymm11, (%5) \n\t" // write c " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" @@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm12, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm12, (%2) \n\t" // write a " vmovups %%ymm12, (%5,%7,1) \n\t" // write c " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" @@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm13, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm13, (%2) \n\t" // write a " vmovups %%ymm13, (%5,%7,2) \n\t" // write c " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" @@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT 
*b, FLOAT *c, BLASLON " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm14, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm14, (%2) \n\t" // write a " vmovups %%ymm14, (%6) \n\t" // write c " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" - " addq $32, %8 \n\t" // a=a+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb - " vmovups %%ymm15, (%8) \n\t" // write a + " vmovups %%ymm15, (%2) \n\t" // write a " vmovups %%ymm15, (%6,%7,1) \n\t" // write c " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c3), // 5 "r" (c6), // 6 "r" (ldc), // 7 - "r" (as), // 8 - "r" (bs) // 9 + "r" (a), // 8 + "r" (b) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From f209fc7fa90a583e60ff2c667821d39ae0efbe70 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sat, 16 Feb 2019 12:12:39 +0100 Subject: [PATCH 113/189] Update Makefile.rule add note about NUM_THREADS for package maintainers, add examples of programs that cause affinity troubles --- Makefile.rule | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index faf34c0a11..bba3d15884 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -72,10 +72,16 @@ VERSION = 0.3.6.dev # You can define the maximum number of threads. Basically it should be less # than or equal to the number of CPU threads. If you don't specify one, it's -# automatically detected by the the script. +# automatically detected by the build system. # If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to # restrict NUM_THREADS to the number of physical cores. By default, the automatic # detection includes logical CPUs, thus allowing the use of SMT. +# Users may opt at runtime to use less than NUM_THREADS threads. +# +# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS +# value (e.g. 32-256) if you expect your users to use that many threads. Due to the way +# some internal structures are allocated, using a large NUM_THREADS value has a RAM +# footprint penalty, even if users reduce the actual number of threads at runtime. # NUM_THREADS = 24 # If you have enabled USE_OPENMP and your application would call @@ -138,6 +144,7 @@ NO_WARMUP = 1 # to the same core(s) as OpenBLAS, possibly binding all threads to a single core. # For this reason, affinity handling is disabled by default. Can be safely enabled if nothing # else modifies affinity settings. +# Note: enabling affinity has been known to cause problems with NumPy and R NO_AFFINITY = 1 # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:24:11 +0100 Subject: [PATCH 114/189] Fix inline assembly constraints rework indices to allow marking argument lda4 as input and output. 
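Extended-asm operands are numbered in order of appearance, outputs first and inputs after them, so moving lda4 from the tail of the input list into the output list shifts every %N in the template: the old %8 becomes %2, and x, y and the ap[] columns slide from %2..%7 up to %3..%8. A hypothetical illustration of the numbering rule (names invented, not the kernel itself):

static void numbering_sketch(long i, long n, long lda)
{
    __asm__ __volatile__ (
        "addq $4, %0 \n\t"   /* advances i                              */
        "addq $4, %2 \n\t"   /* advances lda, so lda must be an output  */
        "subq $4, %1 \n\t"
        : "+r" (i),          /* %0 */
          "+r" (n),          /* %1 */
          "+r" (lda)         /* %2 -- would have been %8 had it stayed  */
        :                    /*       at the end of the input list      */
        : "cc");
}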
For #2009 --- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index 11a3e943b7..d21232bfaf 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "movss (%2), %%xmm12 \n\t" // x0 - "movss 4(%2), %%xmm13 \n\t" // x1 - "movss 8(%2), %%xmm14 \n\t" // x2 - "movss 12(%2), %%xmm15 \n\t" // x3 + "movss (%3), %%xmm12 \n\t" // x0 + "movss 4(%3), %%xmm13 \n\t" // x1 + "movss 8(%3), %%xmm14 \n\t" // x2 + "movss 12(%3), %%xmm15 \n\t" // x3 "shufps $0, %%xmm12, %%xmm12\n\t" "shufps $0, %%xmm13, %%xmm13\n\t" "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" - "movss 16(%2), %%xmm0 \n\t" // x4 - "movss 20(%2), %%xmm1 \n\t" // x5 - "movss 24(%2), %%xmm2 \n\t" // x6 - "movss 28(%2), %%xmm3 \n\t" // x7 + "movss 16(%3), %%xmm0 \n\t" // x4 + "movss 20(%3), %%xmm1 \n\t" // x5 + "movss 24(%3), %%xmm2 \n\t" // x6 + "movss 28(%3), %%xmm3 \n\t" // x7 "shufps $0, %%xmm0 , %%xmm0 \n\t" "shufps $0, %%xmm1 , %%xmm1 \n\t" "shufps $0, %%xmm2 , %%xmm2 \n\t" @@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y ".p2align 1 \n\t" - "movups (%4,%0,4), %%xmm8 \n\t" - "movups (%5,%0,4), %%xmm9 \n\t" - "movups (%6,%0,4), %%xmm10 \n\t" - "movups (%7,%0,4), %%xmm11 \n\t" + "movups (%5,%0,4), %%xmm8 \n\t" + "movups (%6,%0,4), %%xmm9 \n\t" + "movups (%7,%0,4), %%xmm10 \n\t" + "movups (%8,%0,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" @@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "movups (%4,%8,4), %%xmm8 \n\t" - "movups (%5,%8,4), %%xmm9 \n\t" - "movups (%6,%8,4), %%xmm10 \n\t" - "movups (%7,%8,4), %%xmm11 \n\t" + "movups (%5,%2,4), %%xmm8 \n\t" + "movups (%6,%2,4), %%xmm9 \n\t" + "movups (%7,%2,4), %%xmm10 \n\t" + "movups (%8,%2,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm1 , %%xmm9 \n\t" @@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addps %%xmm5 , %%xmm4 \n\t" "addq $4 , %0 \n\t" "mulps %%xmm6 , %%xmm4 \n\t" "subq $4 , %1 \n\t" "addps %%xmm4 , %%xmm7 \n\t" - "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y + "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y "jnz 1b \n\t" : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:36:39 +0100 Subject: [PATCH 115/189] Fix inline assembly constraints rework indices to allow marking argument lda as input and output. 
--- kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index b35daa35b0..3fc46542b7 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%3), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha @@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" - "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" - "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" - "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" - "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" - "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" - "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" - "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" + "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" + "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" + "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" + "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" @@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" - "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - "addq $4, %8 \n\t" + "addq $4, %2 \n\t" "addq $4, %0 \n\t" "subq $4, %1 \n\t" @@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" - "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" 
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" - "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" - "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" - "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" + "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" + "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" + "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" @@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" - "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - "addq $8, %8 \n\t" + "addq $8, %2 \n\t" "addq $8, %0 \n\t" "subq $8, %1 \n\t" @@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" - "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" - "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" - "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%8,%0,4) \n\t" + "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%4,%8,4) \n\t" - "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" - "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" - "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" - "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" + "prefetcht0 192(%5,%2,4) \n\t" + "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" + "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" + "prefetcht0 192(%6,%2,4) \n\t" + "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" + "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" - "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" - "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" - "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" - "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "prefetcht0 192(%7,%2,4) \n\t" + "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" + "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" + "prefetcht0 192(%8,%2,4) \n\t" + "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" + "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" @@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps 
%%ymm6, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" - "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y - "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y + "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y - "addq $16, %8 \n\t" + "addq $16, %2 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:46:17 +0100 Subject: [PATCH 116/189] Fix inline assembly constraints --- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++----------- 1 file changed, 97 insertions(+), 97 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c index 31001c7f3d..bbf06c84b5 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "vbroadcastss (%2), %%xmm12 \n\t" // x0 - "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 - "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 - "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 - "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 - "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 - "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 - "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 + "vbroadcastss (%3), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 + "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 + "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 + "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 + "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 "vbroadcastss (%9), %%xmm8 \n\t" // alpha @@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" "addq $4 , %0 \n\t" - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" - "addq $4 , %8 \n\t" + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" + "addq $4 , %2 \n\t" "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" - 
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" + "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" "subq $4 , %1 \n\t" - "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y "2: \n\t" @@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y + "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y "addq $8 , %0 \n\t" - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "subq $8 , %1 \n\t" @@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" "prefetcht0 192(%6,%0,4) \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, 
%%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" + "prefetcht0 192(%8,%0,4) \n\t" + "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" ".align 2 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" - - "prefetcht0 192(%4,%8,4) \n\t" - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" - "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" - "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" + + "prefetcht0 192(%5,%2,4) \n\t" + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" + "prefetcht0 192(%6,%2,4) \n\t" + "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "prefetcht0 192(%7,%2,4) \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" + "prefetcht0 192(%8,%2,4) \n\t" + "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" - "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" - "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" + "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" + "vfmaddps 
48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" "addq $16, %0 \n\t" - "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y - "addq $16, %8 \n\t" - "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y + "addq $16, %2 \n\t" + "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:51:09 +0100 Subject: [PATCH 117/189] Fix inline assembly constraints --- dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 dgemv_n_microk_piledriver-4.c diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c new file mode 100644 index 0000000000..466931b82f --- /dev/null +++ b/dgemv_n_microk_piledriver-4.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+
+
+#define HAVE_KERNEL_4x8 1
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+    BLASLONG register i = 0;
+
+    __asm__ __volatile__
+    (
+    "vzeroupper                              \n\t"
+    "vbroadcastsd    (%3), %%ymm12           \n\t"  // x0
+    "vbroadcastsd   8(%3), %%ymm13           \n\t"  // x1
+    "vbroadcastsd  16(%3), %%ymm14           \n\t"  // x2
+    "vbroadcastsd  24(%3), %%ymm15           \n\t"  // x3
+    "vbroadcastsd  32(%3), %%ymm0            \n\t"  // x4
+    "vbroadcastsd  40(%3), %%ymm1            \n\t"  // x5
+    "vbroadcastsd  48(%3), %%ymm2            \n\t"  // x6
+    "vbroadcastsd  56(%3), %%ymm3            \n\t"  // x7
+
+    "vbroadcastsd    (%9), %%ymm6            \n\t"  // alpha
+
+    "testq  $0x04, %1                        \n\t"
+    "jz     2f                               \n\t"
+
+    "vmovupd  (%4,%0,8), %%ymm7              \n\t"  // 4 * y
+    "vxorpd   %%ymm4 , %%ymm4, %%ymm4        \n\t"
+    "vxorpd   %%ymm5 , %%ymm5, %%ymm5        \n\t"
+
+    "vfmadd231pd  (%5,%0,8), %%ymm12, %%ymm4 \n\t"
+    "vfmadd231pd  (%6,%0,8), %%ymm13, %%ymm5 \n\t"
+    "vfmadd231pd  (%7,%0,8), %%ymm14, %%ymm4 \n\t"
+    "vfmadd231pd  (%8,%0,8), %%ymm15, %%ymm5 \n\t"
+
+    "vfmadd231pd  (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
+    "vfmadd231pd  (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
+    "vfmadd231pd  (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
+    "vfmadd231pd  (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
+
+    "vaddpd  %%ymm4 , %%ymm5 , %%ymm5        \n\t"
+    "vmulpd  %%ymm6 , %%ymm5 , %%ymm5        \n\t"
+    "vaddpd  %%ymm7 , %%ymm5 , %%ymm5        \n\t"
+
+
+    "vmovupd  %%ymm5, (%4,%0,8)              \n\t"  // 4 * y
+
+    "addq   $4 , %2                          \n\t"
+    "addq   $4 , %0                          \n\t"
+    "subq   $4 , %1                          \n\t"
+
+    "2:                                      \n\t"
+
+    "cmpq   $0, %1                           \n\t"
+    "je     3f                               \n\t"
+
+
+    ".align 16                               \n\t"
+    "1:                                      \n\t"
+
+    "vxorpd   %%ymm4 , %%ymm4, %%ymm4        \n\t"
+    "vxorpd   %%ymm5 , %%ymm5, %%ymm5        \n\t"
+    "vmovupd    (%4,%0,8), %%ymm8            \n\t"  // 4 * y
+    "vmovupd  32(%4,%0,8), %%ymm9            \n\t"  // 4 * y
+
+    "vfmadd231pd    (%5,%0,8), %%ymm12, %%ymm4 \n\t"
+    "vfmadd231pd  32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
+    "vfmadd231pd    (%6,%0,8), %%ymm13, %%ymm4 \n\t"
+    "vfmadd231pd  32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
+    "vfmadd231pd    (%7,%0,8), %%ymm14, %%ymm4 \n\t"
+    "vfmadd231pd  32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
+    "vfmadd231pd    (%8,%0,8), %%ymm15, %%ymm4 \n\t"
+    "vfmadd231pd  32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
+
+    "vfmadd231pd    (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
+    "addq   $8 , %0                          \n\t"
+    "vfmadd231pd  32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
+    "vfmadd231pd    (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
+    "vfmadd231pd  32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
+    "vfmadd231pd    (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
+    "vfmadd231pd  32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
+    "vfmadd231pd    (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
+    "vfmadd231pd  32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
+
+    "vfmadd231pd  %%ymm6 , %%ymm4 , %%ymm8   \n\t"
+    "vfmadd231pd  %%ymm6 , %%ymm5 , %%ymm9   \n\t"
+
+    "addq   $8 , %2                          \n\t"
+    "vmovupd  %%ymm8,-64(%4,%0,8)            \n\t"  // 4 * y
+    "subq   $8 , %1                          \n\t"
+    "vmovupd  %%ymm9,-32(%4,%0,8)            \n\t"  // 4 * y
+
+    "jnz    1b                               \n\t"
+
+    "3:                                      \n\t"
+    "vzeroupper                              \n\t"
+
+    :
+      "+r" (i),     // 0
+      "+r" (n),     // 1
+      "+r" (lda4)   // 2
+    :
+      "r" (x),      // 3
+      "r" (y),      // 4
+      "r" (ap[0]),  // 5
+      "r" (ap[1]),  // 6
+      "r" (ap[2]),  // 7
+      "r" (ap[3]),  // 8
+      "r" (alpha)   // 9
+    : "cc",
+      "%xmm0", "%xmm1",
+      "%xmm2", "%xmm3",
+      "%xmm4", "%xmm5",
+      "%xmm6", "%xmm7",
+      "%xmm8", "%xmm9",
+      "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+      "memory"
+    );
+
+}
+
+
+
+#define HAVE_KERNEL_4x4 1
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+    BLASLONG register i = 0;
+
+    __asm__ __volatile__
+    (
+    "vzeroupper                              \n\t"
+    "vbroadcastsd    (%2), %%ymm12           \n\t"  // x0
+    "vbroadcastsd   8(%2), %%ymm13           \n\t"  // x1
+    "vbroadcastsd  16(%2), %%ymm14           \n\t"  // x2
+    "vbroadcastsd  24(%2), %%ymm15           \n\t"  // x3
+
+    "vbroadcastsd    (%8), %%ymm6            \n\t"  // alpha
+
+    "testq  $0x04, %1                        \n\t"
+    "jz     2f                               \n\t"
+
+    "vxorpd   %%ymm4 , %%ymm4, %%ymm4        \n\t"
+    "vxorpd   %%ymm5 , %%ymm5, %%ymm5        \n\t"
+    "vmovupd  (%3,%0,8), %%ymm7              \n\t"  // 4 * y
+
+    "vfmadd231pd  (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+    "vfmadd231pd  (%5,%0,8), %%ymm13, %%ymm5 \n\t"
+    "vfmadd231pd  (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+    "vfmadd231pd  (%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+    "vaddpd  %%ymm4 , %%ymm5 , %%ymm5        \n\t"
+    "vmulpd  %%ymm6 , %%ymm5 , %%ymm5        \n\t"
+    "vaddpd  %%ymm7 , %%ymm5 , %%ymm5        \n\t"
+
+    "vmovupd  %%ymm5, (%3,%0,8)              \n\t"  // 4 * y
+
+    "addq   $4 , %0                          \n\t"
+    "subq   $4 , %1                          \n\t"
+
+    "2:                                      \n\t"
+
+    "cmpq   $0, %1                           \n\t"
+    "je     3f                               \n\t"
+
+
+    ".align 16                               \n\t"
+    "1:                                      \n\t"
+    "vxorpd   %%ymm4 , %%ymm4, %%ymm4        \n\t"
+    "vxorpd   %%ymm5 , %%ymm5, %%ymm5        \n\t"
+    "vmovupd    (%3,%0,8), %%ymm8            \n\t"  // 4 * y
+    "vmovupd  32(%3,%0,8), %%ymm9            \n\t"  // 4 * y
+
+    "vfmadd231pd    (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+    "vfmadd231pd  32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
+    "vfmadd231pd    (%5,%0,8), %%ymm13, %%ymm4 \n\t"
+    "vfmadd231pd  32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
+    "vfmadd231pd    (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+    "vfmadd231pd  32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
+    "vfmadd231pd    (%7,%0,8), %%ymm15, %%ymm4 \n\t"
+    "vfmadd231pd  32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+    "vfmadd231pd  %%ymm6 , %%ymm4 , %%ymm8   \n\t"
+    "vfmadd231pd  %%ymm6 , %%ymm5 , %%ymm9   \n\t"
+
+    "vmovupd  %%ymm8,   (%3,%0,8)            \n\t"  // 4 * y
+    "vmovupd  %%ymm9, 32(%3,%0,8)            \n\t"  // 4 * y
+
+    "addq   $8 , %0                          \n\t"
+    "subq   $8 , %1                          \n\t"
+    "jnz    1b                               \n\t"
+
+    "3:                                      \n\t"
+    "vzeroupper                              \n\t"
+
+    :
+      "+r" (i),     // 0
+      "+r" (n)      // 1
+    :
+      "r" (x),      // 2
+      "r" (y),      // 3
+      "r" (ap[0]),  // 4
+      "r" (ap[1]),  // 5
+      "r" (ap[2]),  // 6
+      "r" (ap[3]),  // 7
+      "r" (alpha)   // 8
+    : "cc",
+      "%xmm4", "%xmm5",
+      "%xmm6", "%xmm7",
+      "%xmm8", "%xmm9",
+      "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+      "memory"
+    );
+
+}
+
+
From f9bb76d29af48f448a8ab2bdfffc962d9623a3df Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 16 Feb 2019 20:06:48 +0100
Subject: [PATCH 118/189] Fix inline assembly constraints in Bulldozer TRSM kernels

Rework indices to allow marking i, as and bs as both input and output
(operand n1 is marked as well, for simplicity).
For #2009 --- kernel/x86_64/dtrsm_kernel_RT_bulldozer.c | 96 ++++---- kernel/x86_64/strsm_kernel_LN_bulldozer.c | 252 ++++++++++----------- kernel/x86_64/strsm_kernel_LT_bulldozer.c | 256 +++++++++++----------- kernel/x86_64/strsm_kernel_RN_bulldozer.c | 54 ++--- kernel/x86_64/strsm_kernel_RT_bulldozer.c | 54 ++--- 5 files changed, 356 insertions(+), 356 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c index 54df5b3594..35ed4cc013 100644 --- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c @@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " prefetcht0 384(%3,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " prefetcht0 384(%7,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 
32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vmovddup (%7), %%xmm1 \n\t" // read b - " vmovddup 8(%7), %%xmm0 \n\t" // read bb + " vmovddup (%3), %%xmm1 \n\t" // read b + " vmovddup 8(%3), %%xmm0 \n\t" // read bb " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $16 , %7 \n\t" // b = b - 2 - " subq $64 , %6 \n\t" // a = a - 8 + " subq $16 , %3 \n\t" // b = b - 2 + " subq $64 , %2 \n\t" // a = a - 8 - " vmovddup (%7), %%xmm0 \n\t" // read bb + " vmovddup (%3), %%xmm0 \n\t" // read bb " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c index 1b8991c6cf..3cd215000b 100644 --- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c @@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON 
"3: \n\t" - " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) 
\n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 
, %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] + " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] + " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 
* aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] + " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] + " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] + " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // 
bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] + " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] + " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] + " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // 
read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] + " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] + " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c index 0623dddb0c..a4a62491cd 100644 --- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" - " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] + " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // 
extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] + " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] + " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read 
a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] + " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] + " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 
\n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] + " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] + " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] + " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " 
vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] + " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] + " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" 
// b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 
\n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c index 4cc557d552..c11c84cec8 100644 --- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 0 - " vbroadcastss (%7), %%xmm0 \n\t" // read bb - " vbroadcastss 4(%7), %%xmm1 \n\t" // read b + " vbroadcastss (%3), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%3), %%xmm1 \n\t" // read b " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 
, (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" " \n\t" // i = 1 - " addq $8 , %7 \n\t" // b = b + 2 - " addq $64 , %6 \n\t" // a = a + 16 + " addq $8 , %3 \n\t" // b = b + 2 + " addq $64 , %2 \n\t" // a = a + 16 - " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c index 73f6e8a956..326ca29761 100644 --- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c @@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vbroadcastss (%7), %%xmm1 \n\t" // read b - " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + " vbroadcastss (%3), %%xmm1 \n\t" // read b + " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " 
vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $8 , %7 \n\t" // b = b - 2 - " subq $64 , %6 \n\t" // a = a - 16 + " subq $8 , %3 \n\t" // b = b - 2 + " subq $64 , %2 \n\t" // a = a - 16 - " vbroadcastss (%7), %%xmm0 \n\t" // read bb + " vbroadcastss (%3), %%xmm0 \n\t" // read bb " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From 56089991e2305ce692482186825c44c89a535518 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sat, 16 Feb 2019 23:26:13 +0100 Subject: [PATCH 119/189] fix the the --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index bba3d15884..91f42e3960 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -72,7 +72,7 @@ VERSION = 0.3.6.dev # You can define the maximum number of threads. Basically it should be less # than or equal to the number of CPU threads. If you don't specify one, it's -# automatically detected by the the build system. +# automatically detected by the build system. # If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to # restrict NUM_THREADS to the number of physical cores. By default, the automatic # detection includes logical CPUs, thus allowing the use of SMT. 
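The NUM_THREADS comment above refers to the automatic core-count detection performed by the build system. As a rough illustration only (this is not the actual OpenBLAS detection code), the logical-CPU count that the default picks up can be queried on most POSIX systems like this:

    #include <stdio.h>
    #include <unistd.h>

    /* Minimal sketch: report the number of online logical CPUs, which is what
       the automatic NUM_THREADS detection effectively uses when SMT/HT is
       enabled; restricting to physical cores has to be done by hand. */
    int main(void) {
        long n = sysconf(_SC_NPROCESSORS_ONLN); /* counts SMT siblings too */
        printf("NUM_THREADS would default to %ld\n", n);
        return 0;
    }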
From 78d9910236739e98a16244679bbd814f1d79ca7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Feb 2019 20:59:48 +0100 Subject: [PATCH 120/189] Correct range_n limiting same bug as seen in #1388, somehow missed in corresponding PR #1389 --- driver/level2/trmv_thread.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 24b881a93b..00092e9569 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -346,8 +346,8 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m) range_n[num_cpu] = m; - + if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; + } queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -386,8 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m) range_n[num_cpu] = m; - + if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; From e29b0cfcc439b1598ba26486763b3cfa46583a9e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Feb 2019 21:03:30 +0100 Subject: [PATCH 121/189] Allow multithreading TRMV again revert workaround introduced for issue #1332 as the actual cause appears to be my incorrect fix from #1262 (see #1388) --- interface/trmv.c | 5 +---- interface/ztrmv.c | 3 --- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/interface/trmv.c b/interface/trmv.c index 7c40ae976f..2e52527a3c 100644 --- a/interface/trmv.c +++ b/interface/trmv.c @@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP -/* nthreads = num_cpu_avail(2); + nthreads = num_cpu_avail(2); -FIXME trmv_thread was found to be broken, see issue 1332 */ - nthreads = 1; - if (nthreads == 1) { #endif diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 0e16632e06..4c47e9e913 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } else nthreads = 1; -/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ - nthreads = 1; - if(nthreads > 1) { buffer_size = n > 16 ? 
0 : n * 4 + 40; } From 45333d57931ddc64fb3e8a091e0616dd9528cef1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Feb 2019 22:16:33 +0100 Subject: [PATCH 122/189] Fix error introduced during cleanup --- driver/level2/trmv_thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 00092e9569..43eeb40d25 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -347,7 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; - } + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -387,6 +387,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; From 343b301d14875a17ff4357bd98bea29d0df70741 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Feb 2019 10:27:48 +0100 Subject: [PATCH 123/189] Reduce list of kernels in the dynamic arch build to make compilation complete reliably within the 1h limit again --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 741c662910..44a616aaaf 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -55,7 +55,7 @@ before_build: - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. + - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. build_script: - cmake --build . From e5df5958cc263550614ecdc0177d9e7704eb0c58 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 24 Feb 2019 20:39:25 +0200 Subject: [PATCH 124/189] init From 6eee1beac524b5582a6c6de14d9d35a78c1ece74 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 24 Feb 2019 20:41:02 +0200 Subject: [PATCH 125/189] move fix to right place --- dgemv_n_microk_piledriver-4.c | 247 -------------------- kernel/x86_64/dgemv_n_microk_piledriver-4.c | 98 ++++---- 2 files changed, 49 insertions(+), 296 deletions(-) delete mode 100644 dgemv_n_microk_piledriver-4.c diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c deleted file mode 100644 index 466931b82f..0000000000 --- a/dgemv_n_microk_piledriver-4.c +++ /dev/null @@ -1,247 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. 
-2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - - -#define HAVE_KERNEL_4x8 1 -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); - -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastsd (%3), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 - - "vbroadcastsd (%9), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - - "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - - "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - - "addq $4 , %2 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - "2: \n\t" - - "cmpq $0, %1 \n\t" - "je 3f \n\t" - - - ".align 16 \n\t" - "1: \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "addq $8 , %0 \n\t" - "vfmadd231pd 32(%5,%2,8), %%ymm0 , 
%%ymm5 \n\t" - "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - - "addq $8 , %2 \n\t" - "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y - "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y - - "jnz 1b \n\t" - - "3: \n\t" - "vzeroupper \n\t" - - : - "+r" (i), // 0 - "+r" (n), // 1 - "+r" (lda4) // 2 - : - "r" (x), // 3 - "r" (y), // 4 - "r" (ap[0]), // 5 - "r" (ap[1]), // 6 - "r" (ap[2]), // 7 - "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -#define HAVE_KERNEL_4x4 1 -static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); - -static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 - - "vbroadcastsd (%8), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - "2: \n\t" - - "cmpq $0, %1 \n\t" - "je 3f \n\t" - - - ".align 16 \n\t" - "1: \n\t" - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - - "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y - "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y - - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - "jnz 1b \n\t" - - "3: \n\t" - "vzeroupper \n\t" - - : - "+r" (i), // 0 - "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (alpha) // 8 - : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c index 530780bab7..466931b82f 100644 --- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c +++ 
b/kernel/x86_64/dgemv_n_microk_piledriver-4.c @@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 + "vbroadcastsd (%3), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 "vbroadcastsd (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" "addq $8 , %0 \n\t" - "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 
32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y "jnz 1b \n\t" @@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From 918a0cc4d1548617478f925c8341461c055268e5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Feb 2019 17:55:36 +0100 Subject: [PATCH 126/189] Fix missing -c option in AVX512 test --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 38f9170ca1..d93b756d53 100644 --- a/c_check +++ b/c_check @@ -232,7 +232,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { ($fh,$tmpf) = tempfile( UNLINK => 1 ); $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; print $tmpf "#include \n\nint main(void){ __asm__ volatile($code); }\n"; - $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; + $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf"; my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? 
!= 0) { From fd34820b99bd302ed2b31ca0e5fedeb492a179c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Feb 2019 17:58:31 +0100 Subject: [PATCH 127/189] Fix AVX512 test always returning false due to missing compiler option --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 6b602c1b0f..88bb081a69 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -78,7 +78,7 @@ endif() if (X86_64 OR X86) file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include \n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") -execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) +execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() From d66214c94628bb2050b2ab83361d1ac54d3373b5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 28 Feb 2019 09:58:25 +0100 Subject: [PATCH 128/189] Make x86_32 imply NO_AVX2, NO_AVX512 in addition to NO_AVX fixes #2033 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 67c8cd1972..bbd777448b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -155,7 +155,7 @@ GETARCH_FLAGS += -DNO_AVX endif ifeq ($(BINARY), 32) -GETARCH_FLAGS += -DNO_AVX +GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 endif ifeq ($(NO_AVX2), 1) From 2ffb72718787bea52f7958d2fe5b91c489cd2aee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 28 Feb 2019 10:51:54 +0100 Subject: [PATCH 129/189] Keep xcode8.3 for osx BINARY=32 build as xcode10 deprecated i386 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index ec5dc8a9bf..eee7674fe7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -160,6 +160,7 @@ matrix: - BTYPE="BINARY=64 INTERFACE64=1" - <<: *test-macos + osx_image: xcode8.3 env: - BTYPE="BINARY=32" From c4868d11c02f1ac97e71afdef3dc49429678959b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Mar 2019 09:23:03 +0100 Subject: [PATCH 130/189] Make sure that AVX512 is disabled in 32bit builds for #2033 --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index bbd777448b..53f89b2fa6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -156,6 +156,7 @@ endif ifeq ($(BINARY), 32) GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 +NO_AVX512 = 1 endif ifeq ($(NO_AVX2), 1) From 25427926bc8b74a48e335ae05c56cbfd8d0187b9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Mar 2019 23:36:36 +0100 Subject: [PATCH 131/189] Improve handling of NO_STATIC and NO_SHARED to avoid surprises from defining either as zero. 
Fixes #2035 by addressing some concerns from #1422 --- Makefile | 2 +- Makefile.install | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 21096f893c..273fde33ed 100644 --- a/Makefile +++ b/Makefile @@ -96,7 +96,7 @@ endif @echo shared : -ifndef NO_SHARED +ifneq ($(NO_SHARED), 1) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so diff --git a/Makefile.install b/Makefile.install index 069c96c6aa..fefecd98d5 100644 --- a/Makefile.install +++ b/Makefile.install @@ -58,14 +58,14 @@ ifndef NO_LAPACKE endif #for install static library -ifndef NO_STATIC +ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @@ -106,14 +106,14 @@ ifndef NO_LAPACKE endif #for install static library -ifndef NO_STATIC +ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -138,7 +138,7 @@ endif @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" From e5c316c6b94bf689b2fb20603c0c00cb72fc4ec9 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 3 Mar 2019 08:59:27 +0200 Subject: [PATCH 132/189] init From e4a79be6bb9fac2ba18d820d83bc7bf9173a63c2 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 3 Mar 2019 09:05:11 +0200 Subject: [PATCH 133/189] address warning introed with #1814 et al --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 09851f15c0..c30ca71cb9 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2584,7 +2584,7 @@ void *blas_memory_alloc(int procpos){ int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; + int mypos = 0; #endif void *map_address; From af480b02a4a45df377acf9be0d6078609bb345c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Mar 2019 14:17:07 +0100 Subject: [PATCH 134/189] Restore locking optimizations for OpenMP case restore another accidentally dropped part of #1468 that was missed in #2004 to address performance regression reported in #1461 --- driver/others/memory.c | 27 +++++++++++++++++---------- 1 
file changed, 17 insertions(+), 10 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 2e185593e8..a40cb442af 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2647,21 +2647,26 @@ void *blas_memory_alloc(int procpos){ position = 0; +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif do { -/* if (!memory[position].used) { */ -/* blas_lock(&memory[position].lock);*/ - +#if defined(USE_OPENMP) + if (!memory[position].used) { + blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; -/* blas_unlock(&memory[position].lock);*/ -/* } */ - +#if defined(USE_OPENMP) + blas_unlock(&memory[position].lock); + } +#endif position ++; } while (position < NUM_BUFFERS); - UNLOCK_COMMAND(&alloc_lock); - +#if defined(SMP) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif goto error; allocation : @@ -2671,9 +2676,11 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; - +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); - +#else + blas_unlock(&memory[position].lock); +#endif if (!memory[position].addr) { do { #ifdef DEBUG From 783ba8058fbc6d5f0a56d27bc368b659448b1fb1 Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:30:50 +0800 Subject: [PATCH 135/189] HiSilicon tsv110 CPUs optimization branch add HiSilicon tsv110 CPUs optimization branch --- kernel/arm64/KERNEL.TSV110 | 175 +++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 kernel/arm64/KERNEL.TSV110 diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 new file mode 100644 index 0000000000..04d6940d7a --- /dev/null +++ b/kernel/arm64/KERNEL.TSV110 @@ -0,0 +1,175 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S 
+DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot.S +DDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + From 53f482ee72e56b31ace7860199c8fb3027af5303 Mon 
Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:41:21 +0800 Subject: [PATCH 136/189] add TARGET support for HiSilicon tsv110 CPUs --- Makefile.arm64 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index cd16dbfaed..4d10ff6844 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99) CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif + +ifeq ($(CORE), TSV110) +CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 +FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 +endif From 760842dda1fd8f0475216b46ca25fc016f671d05 Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:45:22 +0800 Subject: [PATCH 137/189] add TARGET support for HiSilicon tsv110 CPUs --- getarch.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/getarch.c b/getarch.c index 242d080044..ac58c82266 100644 --- a/getarch.c +++ b/getarch.c @@ -1065,6 +1065,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_TSV110 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "TSV110" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DTSV110 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "tsv110" +#define CORENAME "TSV110" +#else +#endif + + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" From fb4dae71240be9ad1e55792a46b38f8e107cb70a Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:48:49 +0800 Subject: [PATCH 138/189] add TARGET support for HiSilicon tsv110 CPUs --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 3a5a322344..aebd0dd187 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -90,6 +90,7 @@ CORTEXA73 FALKOR THUNDERX THUNDERX2T99 +TSV110 9.System Z: ZARCH_GENERIC From e4864a8933f6875bbb434887dc9120dbcf6be4dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Mar 2019 21:17:08 +0100 Subject: [PATCH 139/189] Fix module definition conflicts between LAPACK and ReLAPACK for #2043 --- CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9de894f9ce..a27c1c0fc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,10 +75,10 @@ endif () set(SUBDIRS ${BLASDIRS}) if (NOT NO_LAPACK) - list(APPEND SUBDIRS lapack) if(BUILD_RELAPACK) list(APPEND SUBDIRS relapack/src) endif() + list(APPEND SUBDIRS lapack) endif () # set which float types we want to build for @@ -224,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES SOVERSION ${OpenBLAS_MAJOR_VERSION} ) +if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) + if (NOT MSVC) + target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") + else() + target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") + endif() +endif() + if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") From 11cfd0bd75a1ce8714ca3abf6867d3f45548dab1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Mar 2019 16:04:25 +0100 Subject: [PATCH 140/189] Do not compile in AVX512 check if AVX support is disabled xgetbv is function 
depends on NO_AVX being undefined - we could change that too, but that combo is unlikely to work anyway --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 99c9254acb..46dfaea6cd 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -322,7 +322,7 @@ int support_avx2(){ } int support_avx512(){ -#ifndef NO_AVX512 +#if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; From 4290afdae247337261b5ca0ea76e5bfcad2cc4a9 Mon Sep 17 00:00:00 2001 From: ken-cunningham-webuse Date: Wed, 6 Mar 2019 20:55:06 -0800 Subject: [PATCH 141/189] ctest.c : add __POWERPC__ for PowerMac --- ctest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest.c b/ctest.c index 0571e9e028..5e869b901f 100644 --- a/ctest.c +++ b/ctest.c @@ -113,7 +113,7 @@ ARCH_X86 ARCH_X86_64 #endif -#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) +#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__) ARCH_POWER #endif From b7f59da42d3978234e7e6ed293365b66f340189d Mon Sep 17 00:00:00 2001 From: Celelibi Date: Thu, 7 Mar 2019 16:39:41 +0100 Subject: [PATCH 142/189] Fix crash in sgemm SSE/nano kernel on x86_64 Fix bug #2047. Signed-off-by: Celelibi --- kernel/x86_64/gemm_kernel_4x8_nano.S | 2 +- kernel/x86_64/gemm_kernel_8x4_sse.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/gemm_kernel_4x8_nano.S b/kernel/x86_64/gemm_kernel_4x8_nano.S index 074562804c..e29520fa1b 100644 --- a/kernel/x86_64/gemm_kernel_4x8_nano.S +++ b/kernel/x86_64/gemm_kernel_4x8_nano.S @@ -135,7 +135,7 @@ #endif movq %rsp, %rbx # save old stack - subq $128 + LOCAL_BUFFER_SIZE, %rsp + subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING diff --git a/kernel/x86_64/gemm_kernel_8x4_sse.S b/kernel/x86_64/gemm_kernel_8x4_sse.S index c4ef1f809a..1602c13c50 100644 --- a/kernel/x86_64/gemm_kernel_8x4_sse.S +++ b/kernel/x86_64/gemm_kernel_8x4_sse.S @@ -383,7 +383,7 @@ EMMS movq %rsp, %rbx # save old stack - subq $128 + LOCAL_BUFFER_SIZE, %rsp + subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING From b0c714ef602095c764b58c0a9ba68fddd9008c73 Mon Sep 17 00:00:00 2001 From: ken-cunningham-webuse Date: Thu, 7 Mar 2019 11:36:35 -0800 Subject: [PATCH 143/189] param.h : enable defines for PPC970 on DarwinOS fixes: gemm.c: In function 'sgemm_': ../common_param.h:981:18: error: 'SGEMM_DEFAULT_P' undeclared (first use in this function) #define SGEMM_P SGEMM_DEFAULT_P ^ --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3cc400b548..48b7ef383d 100644 --- a/param.h +++ b/param.h @@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_DARWIN) #if L2_SIZE == 1024976 #define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 256 From f7a06463d9a0db120cc530a3298f3290855ccbe9 Mon Sep 17 00:00:00 2001 From: ken-cunningham-webuse Date: Thu, 7 Mar 2019 11:41:58 -0800 Subject: [PATCH 144/189] common_power.h: force DCBT_ARG 0 on PPC970 Darwin without this, we see ../kernel/power/gemv_n.S:427:Parameter syntax error and many more similar entries that relate to this assembly command dcbt 8, r24, r18. This change sets DCBT_ARG = 0 and openblas builds through to completion on PowerMac 970. Tests pass. --- common_power.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_power.h b/common_power.h index e3a1a7aef4..60de48a631 100644 --- a/common_power.h +++ b/common_power.h @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || ( defined(PPC970) && defined(OS_DARWIN) ) #define DCBT_ARG 0 #else #define DCBT_ARG 8 From 5b95534afcc80d54f51bd766b617fd3f494ec65a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Mar 2019 11:21:16 +0100 Subject: [PATCH 145/189] Make TARGET=GENERIC compatible with DYNAMIC_ARCH=1 for issue #2048 --- kernel/Makefile.L3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index eafcfb1b41..bf5fffe867 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif -ifeq ($(CORE), GENERIC) +ifeq ($(CORE), GENERIC) USE_TRMM = 1 endif From f074d7d1463c15bbf838b2305f259160281dead3 Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Tue, 12 Mar 2019 16:05:19 +0800 Subject: [PATCH 146/189] make DYNAMIC_ARCH=1 package work on TSV110.
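The diff that follows extends the arm64 runtime detection used by the DYNAMIC_ARCH build. On Linux the core type is identified by matching the "CPU implementer" and "CPU part" fields of /proc/cpuinfo (0x48 and 0xd01 for the HiSilicon TSV110). A stripped-down sketch of that matching logic, with the surrounding OpenBLAS scaffolding omitted:

    #include <stdio.h>
    #include <string.h>

    /* Sketch of the /proc/cpuinfo matching done in cpuid_arm64.c:
       HiSilicon is implementer 0x48 and the TSV110 core is part 0xd01. */
    static int is_tsv110(void) {
        char line[128];
        int implementer = 0, part = 0;
        FILE *f = fopen("/proc/cpuinfo", "r");
        if (!f) return 0;
        while (fgets(line, sizeof(line), f)) {
            if (!strncmp(line, "CPU implementer", 15) && strstr(line, "0x48")) implementer = 1;
            if (!strncmp(line, "CPU part", 8) && strstr(line, "0xd01")) part = 1;
        }
        fclose(f);
        return implementer && part;
    }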
--- cpuid_arm64.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 5077d7b11c..a5e731d747 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -39,6 +39,8 @@ // Cavium #define CPU_THUNDERX 7 #define CPU_THUNDERX2T99 8 +//Hisilicon +#define CPU_TSV110 9 static char *cpuname[] = { "UNKNOWN", @@ -49,7 +51,8 @@ static char *cpuname[] = { "CORTEXA73", "FALKOR", "THUNDERX", - "THUNDERX2T99" + "THUNDERX2T99", + "TSV110" }; static char *cpuname_lower[] = { @@ -61,7 +64,8 @@ static char *cpuname_lower[] = { "cortexa73", "falkor", "thunderx", - "thunderx2t99" + "thunderx2t99", + "tsv110" }; int get_feature(char *search) @@ -145,6 +149,9 @@ int detect(void) return CPU_THUNDERX; else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; + // HiSilicon + else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) + return CPU_TSV110; } p = (char *) NULL ; @@ -286,6 +293,21 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; + + case CPU_TSV110: + printf("#define TSV110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 4 \n"); + printf("#define L1_DATA_SIZE 65536 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 4 \n"); + printf("#define L2_SIZE 524228 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; } } From 7e3eb9b25d26ca9be337acf0b0fd2c647e353e0c Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Tue, 12 Mar 2019 16:11:01 +0800 Subject: [PATCH 147/189] make DYNAMIC_ARCH=1 package work on TSV110 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3cc400b548..79fb05380a 100644 --- a/param.h +++ b/param.h @@ -2591,7 +2591,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(CORTEXA53) || defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) + defined(FALKOR) || defined(TSV110) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 From b1393c7a97e2da1b64e1f779bdf68b7af0924543 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Mar 2019 16:03:56 +0100 Subject: [PATCH 148/189] Add Intel Denverton for #2048 --- cpuid_x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index c45ddd9680..884d4b78ae 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1359,6 +1359,8 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 12: // Apollo Lake + case 15: + // Denverton return CPUTYPE_NEHALEM; } break; @@ -1376,9 +1378,9 @@ int get_cpuname(void){ } break; case 9: - case 8: + case 8: switch (model) { - case 14: // Kaby Lake + case 14: // Kaby Lake and refreshes if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) From 04f2226ea6edd95decf888b67bbdd4a8de530b54 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Mar 2019 16:09:55 +0100 Subject: [PATCH 149/189] Add Intel Denverton --- driver/others/dynamic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 99c9254acb..895bacb501 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } - //Apollo Lake - if (model == 12) { + //Apollo Lake or Denverton + if (model == 12 || model == 15) { return &gotoblas_NEHALEM; } return NULL; From c3e30b2bc2234dfafc9e674c8ab5723fabeb04c5 Mon Sep 17 00:00:00 2001 From: Sacha Date: Wed, 13 Mar 2019 23:21:54 +1000 Subject: [PATCH 150/189] Change 64-bit detection as explained in #2056 --- cmake/system_check.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 88bb081a69..f30a946b49 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -39,7 +39,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") - set(X86_64 1) + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + set(X86_64 1) + else() + set(X86 1) + endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") From 4fc17d0d754b7905667fb84a68cf37a0d28a93bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Mar 2019 19:20:23 +0100 Subject: [PATCH 151/189] Trivial typo fix as suggested in #2022 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 91f42e3960..8f72c5a79c 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -199,7 +199,7 @@ NO_AFFINITY = 1 # been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 -# If you need santy check by comparing reference BLAS. It'll be very +# If you need sanity check by comparing results to reference BLAS. It'll be very # slow (Not implemented yet). 
# SANITY_CHECK = 1 From e608d4f7fe1a2085b22af206d0c8c2cc128c1e9a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Mar 2019 22:10:28 +0100 Subject: [PATCH 152/189] Disable the AVX512 DGEMM kernel (again) Due to as yet unresolved errors seen in #1955 and #2029 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index acc6356d60..5d0a300b5e 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -7,7 +7,7 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c +#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c DGEMMINCOPY = dgemm_ncopy_8_skylakex.c DGEMMITCOPY = dgemm_tcopy_8_skylakex.c From 1006ff8a7bc4ee77150d6f13483838c96789e3fc Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Fri, 15 Mar 2019 15:06:30 +0100 Subject: [PATCH 153/189] Use POSIX getenv on Cygwin The Windows-native GetEnvironmentVariable cannot be relied on, as Cygwin does not always copy environment variables set through Cygwin to the Windows environment block, particularly after fork(). --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 7fcd5e3163..f239c3d788 100644 --- a/common.h +++ b/common.h @@ -439,7 +439,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 typedef char env_var_t[MAX_PATH]; #define readenv(p, n) 0 #else -#ifdef OS_WINDOWS +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) typedef char env_var_t[MAX_PATH]; #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) #else From 4ad694eda1ff79040778648d44cda5b8f774c38d Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Mon, 18 Mar 2019 20:32:48 +0100 Subject: [PATCH 154/189] Fix for #2063: The DllMain used in Cygwin did not run the thread memory pool cleanup upon THREAD_DETACH which is needed when compiled with USE_TLS=1. 
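The diff below routes the per-thread cleanup through a new blas_thread_memory_cleanup() helper and calls it from the DLL_THREAD_DETACH case of DllMain. The underlying Windows pattern is the standard one for per-thread resources held by a DLL; a minimal sketch, where free_thread_buffers() is a hypothetical stand-in for the memory-pool release:

    #include <windows.h>

    /* Hypothetical stand-in: release whatever thread-local state the
       library keeps for the calling thread (here, its memory-pool slot). */
    static void free_thread_buffers(void) { }

    BOOL APIENTRY DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
        switch (reason) {
        case DLL_THREAD_DETACH:
            free_thread_buffers();   /* runs once for each exiting thread */
            break;
        case DLL_PROCESS_DETACH:
            /* process-wide teardown; threads still alive at this point do
               NOT receive DLL_THREAD_DETACH, so thread-local cleanup cannot
               rely on it exclusively */
            break;
        }
        return TRUE;
    }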
--- driver/others/memory.c | 11 +++++++++-- exports/dllinit.c | 24 +++++++++++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ed407a8580..ac8545f350 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1313,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) { free(map_address); } +#ifdef SMP +void blas_thread_memory_cleanup(void) { + blas_memory_cleanup((void*)get_memory_table()); +} +#endif + + void blas_shutdown(void){ #ifdef SMP BLASFUNC(blas_thread_shutdown)(); @@ -1322,7 +1329,7 @@ void blas_shutdown(void){ /* Only cleanupIf we were built for threading and TLS was initialized */ if (local_storage_key) #endif - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1552,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser break; case DLL_THREAD_DETACH: #if defined(SMP) - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #endif break; case DLL_PROCESS_DETACH: diff --git a/exports/dllinit.c b/exports/dllinit.c index 02ff092e99..0e1bb34e3f 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -40,15 +40,25 @@ void gotoblas_init(void); void gotoblas_quit(void); +#if defined(SMP) && defined(USE_TLS) +void blas_thread_memory_cleanup(void); +#endif BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { - - if (reason == DLL_PROCESS_ATTACH) { - gotoblas_init(); - } - - if (reason == DLL_PROCESS_DETACH) { - gotoblas_quit(); + switch(reason) { + case DLL_PROCESS_ATTACH: + gotoblas_init(); + break; + case DLL_PROCESS_DETACH: + gotoblas_quit(); + break; + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: +#if defined(SMP) && defined(USE_TLS) + blas_thread_memory_cleanup(void); +#endif + break; } return TRUE; From 8ba9e2a61a1cf34e9b2efc5af61f5ebaaf6ab902 Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Tue, 19 Mar 2019 10:22:02 +0100 Subject: [PATCH 155/189] Also call CloseHandle on each thread, as well as on the event so as to not leak thread handles. 
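The in-tree comment added below notes that WaitForMultipleObjects could replace the per-thread wait loop. A hedged sketch of that variant follows; every name here (shutdown_workers, thread_handles, the two events) is an illustrative stand-in for blas_threads[], pool.filled and pool.killed, not the code this patch touches:

    #include <windows.h>

    /* Illustrative shutdown path: signal the workers, wait for all of
       them at once, then close every handle so none leak. */
    static void shutdown_workers(HANDLE *thread_handles, DWORD num_threads,
                                 HANDLE filled_event, HANDLE killed_event)
    {
        SetEvent(killed_event);                  /* ask workers to exit */

        /* Single call instead of a WaitForSingleObject loop; note the
           MAXIMUM_WAIT_OBJECTS (64) limit per call. */
        WaitForMultipleObjects(num_threads, thread_handles,
                               TRUE /* wait for all */, 5000);

        for (DWORD i = 0; i < num_threads; i++)
            CloseHandle(thread_handles[i]);      /* no leaked thread handles */

        CloseHandle(filled_event);
        CloseHandle(killed_event);
    }
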
--- driver/others/blas_server_win32.c | 5 +++++ exports/dllinit.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index bae344c593..0b38ee3658 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ + // Could also just use WaitForMultipleObjects WaitForSingleObject(blas_threads[i], 5); //INFINITE); #ifndef OS_WINDOWSSTORE // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP TerminateThread(blas_threads[i],0); #endif + CloseHandle(blas_threads[i]); } + CloseHandle(pool.filled); + CloseHandle(pool.killed); + blas_server_avail = 0; } diff --git a/exports/dllinit.c b/exports/dllinit.c index 0e1bb34e3f..4a05c0e146 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -56,7 +56,7 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { break; case DLL_THREAD_DETACH: #if defined(SMP) && defined(USE_TLS) - blas_thread_memory_cleanup(void); + blas_thread_memory_cleanup(); #endif break; } From b043a5962e3785c9879f671fca8e7226dc70ff4f Mon Sep 17 00:00:00 2001 From: Ayappan P Date: Mon, 25 Mar 2019 18:53:25 +0530 Subject: [PATCH 156/189] AIX asm syntax changes needed for shared object creation --- common_power.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/common_power.h b/common_power.h index 68087b071e..60de48a631 100644 --- a/common_power.h +++ b/common_power.h @@ -598,9 +598,14 @@ REALNAME:;\ #ifndef __64BIT__ #define PROLOGUE \ .machine "any";\ + .toc;\ .globl .REALNAME;\ + .globl REALNAME;\ + .csect REALNAME[DS],3;\ +REALNAME:;\ + .long .REALNAME, TOC[tc0], 0;\ .csect .text[PR],5;\ -.REALNAME:; +.REALNAME: #define EPILOGUE \ _section_.text:;\ @@ -611,9 +616,14 @@ _section_.text:;\ #define PROLOGUE \ .machine "any";\ + .toc;\ .globl .REALNAME;\ + .globl REALNAME;\ + .csect REALNAME[DS],3;\ +REALNAME:;\ + .llong .REALNAME, TOC[tc0], 0;\ .csect .text[PR], 5;\ -.REALNAME:; +.REALNAME: #define EPILOGUE \ _section_.text:;\ From 853a18bc17628fb1e8615503304ceedef9d45030 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Thu, 14 Mar 2019 10:42:04 +0000 Subject: [PATCH 157/189] power9 makefile. dgemm based on power8 kernel with following changes : 32x unrolled 16x4 kernel and 8x4 kernel using (lxv stxv butterfly rank1 update). improvement from 17 to 22-23gflops. 
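For orientation, stripped of the VSX loads/stores (lxv/stxv) and the 32x unrolling, the 16x4 micro-kernel computes the plain rank-1 accumulation sketched below; the function name and the packed-buffer layout are illustrative only, not the kernel's actual interface:

    /* One rank-1 update of the 16x4 accumulator per k step, assuming
       A packed 16 doubles per k, B packed 4 doubles per k, and C
       column-major with leading dimension ldc. */
    static void micro_kernel_16x4(long K, const double *A, const double *B,
                                  double *C, long ldc, double alpha)
    {
        double acc[16][4] = {{0.0}};
        for (long k = 0; k < K; k++)
            for (int i = 0; i < 16; i++)
                for (int j = 0; j < 4; j++)
                    acc[i][j] += A[k*16 + i] * B[k*4 + j];
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 16; i++)
                C[j*ldc + i] += alpha * acc[i][j];
    }
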
dtrmm cases were added into dgemm itself --- Makefile.power | 10 +- TargetList.txt | 1 + common.h | 5 + common_power.h | 8 +- cpuid_power.c | 8 +- getarch.c | 12 + kernel/Makefile.L3 | 4 + kernel/power/KERNEL.POWER9 | 184 ++ kernel/power/casum.c | 2 +- kernel/power/ccopy.c | 2 +- kernel/power/crot.c | 2 +- kernel/power/cswap.c | 2 +- kernel/power/dasum.c | 2 +- kernel/power/daxpy.c | 2 +- kernel/power/dcopy.c | 2 +- kernel/power/ddot.c | 2 +- kernel/power/dgemm_kernel_power9.S | 249 ++ kernel/power/dgemm_logic_power9.S | 1981 +++++++++++++++ kernel/power/dgemm_macros_power9.S | 3623 ++++++++++++++++++++++++++++ kernel/power/dgemv_n.c | 2 +- kernel/power/drot.c | 2 +- kernel/power/dscal.c | 2 +- kernel/power/dswap.c | 2 +- kernel/power/sasum.c | 2 +- kernel/power/scopy.c | 2 +- kernel/power/sdot.c | 2 +- kernel/power/srot.c | 2 +- kernel/power/sscal.c | 2 +- kernel/power/sswap.c | 2 +- kernel/power/zasum.c | 2 +- kernel/power/zaxpy.c | 7 +- kernel/power/zcopy.c | 2 +- kernel/power/zdot.c | 2 +- kernel/power/zscal.c | 2 +- kernel/power/zswap.c | 2 +- param.h | 31 + 36 files changed, 6133 insertions(+), 36 deletions(-) create mode 100644 kernel/power/KERNEL.POWER9 create mode 100644 kernel/power/dgemm_kernel_power9.S create mode 100644 kernel/power/dgemm_logic_power9.S create mode 100644 kernel/power/dgemm_macros_power9.S diff --git a/Makefile.power b/Makefile.power index a49372ad73..195f1930f8 100644 --- a/Makefile.power +++ b/Makefile.power @@ -9,7 +9,15 @@ else USE_OPENMP = 1 endif - +ifeq ($(CORE), POWER9) +ifeq ($(USE_OPENMP), 1) +COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +endif +endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) diff --git a/TargetList.txt b/TargetList.txt index 3d04a57cf3..44e539c095 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -48,6 +48,7 @@ POWER5 POWER6 POWER7 POWER8 +POWER9 PPCG4 PPC970 PPC970MP diff --git a/common.h b/common.h index 7fcd5e3163..b30a71ff1a 100644 --- a/common.h +++ b/common.h @@ -348,6 +348,11 @@ typedef int blasint; #endif #endif +#ifdef POWER9 +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif /* #ifdef PILEDRIVER diff --git a/common_power.h b/common_power.h index e3a1a7aef4..ddbee9412f 100644 --- a/common_power.h +++ b/common_power.h @@ -39,7 +39,7 @@ #ifndef COMMON_POWER #define COMMON_POWER -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #else @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst @@ -802,7 +802,7 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 
<< 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) +#elif defined(POWER8) || defined(POWER9) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) diff --git a/cpuid_power.c b/cpuid_power.c index 82a3f4aace..d5ba6fb2ce 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -94,7 +94,7 @@ char *corename[] = { "CELL", "PPCG4", "POWER8", - "POWER8" + "POWER9" }; int detect(void){ @@ -124,7 +124,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -156,7 +156,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; @@ -180,7 +180,7 @@ int id; __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { case 0x4e: // POWER9 - return CPUTYPE_POWER8; + return CPUTYPE_POWER9; break; case 0x4d: case 0x4b: // POWER8/8E diff --git a/getarch.c b/getarch.c index 78ba0fefdb..34d46905ac 100644 --- a/getarch.c +++ b/getarch.c @@ -618,6 +618,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER8" #endif +#if defined(FORCE_POWER9) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER9" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER9 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power9" +#define CORENAME "POWER9" +#endif #ifdef FORCE_PPCG4 #define FORCE diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 9258f216dd..db9fccd30a 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -44,6 +44,10 @@ ifeq ($(CORE), POWER8) USE_TRMM = 1 endif +ifeq ($(CORE), POWER9) +USE_TRMM = 1 +endif + ifeq ($(ARCH), zarch) USE_TRMM = 1 endif diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 new file mode 100644 index 0000000000..86a9319714 --- /dev/null +++ b/kernel/power/KERNEL.POWER9 @@ -0,0 +1,184 @@ +#SGEMM_BETA = ../generic/gemm_beta.c +#DGEMM_BETA = ../generic/gemm_beta.c +#CGEMM_BETA = ../generic/zgemm_beta.c +#ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = strmm_kernel_16x8_power8.S +DTRMMKERNEL = dgemm_kernel_power9.S +CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S + +SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_power8.S +DGEMMONCOPY = dgemm_ncopy_4_power8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = 
dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = cgemm_tcopy_8_power8.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o + +ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. +#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +#SAMAXKERNEL = ../arm/amax.c +#DAMAXKERNEL = ../arm/amax.c +#CAMAXKERNEL = ../arm/zamax.c +#ZAMAXKERNEL = ../arm/zamax.c +# +#SAMINKERNEL = ../arm/amin.c +#DAMINKERNEL = ../arm/amin.c +#CAMINKERNEL = ../arm/zamin.c +#ZAMINKERNEL = ../arm/zamin.c +# +#SMAXKERNEL = ../arm/max.c +#DMAXKERNEL = ../arm/max.c +# +#SMINKERNEL = ../arm/min.c +#DMINKERNEL = ../arm/min.c +# +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c +# +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +IZAMINKERNEL = izamin.c +# +#ISMAXKERNEL = ../arm/imax.c +#IDMAXKERNEL = ../arm/imax.c +# +#ISMINKERNEL = ../arm/imin.c +#IDMINKERNEL = ../arm/imin.c +# +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c +# +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c +# +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c +# +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +DSDOTKERNEL = sdot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c +# +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c +# +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c +# +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c +# +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c +# + +SGEMVNKERNEL = sgemv_n.c +DGEMVNKERNEL = dgemv_n.c +CGEMVNKERNEL = cgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c +# +SGEMVTKERNEL = sgemv_t.c +DGEMVTKERNEL = dgemv_t.c +CGEMVTKERNEL = cgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c + + +#SSYMV_U_KERNEL = ../generic/symv_k.c +#SSYMV_L_KERNEL = ../generic/symv_k.c +#DSYMV_U_KERNEL = ../generic/symv_k.c +#DSYMV_L_KERNEL = ../generic/symv_k.c 
+#QSYMV_U_KERNEL = ../generic/symv_k.c +#QSYMV_L_KERNEL = ../generic/symv_k.c +#CSYMV_U_KERNEL = ../generic/zsymv_k.c +#CSYMV_L_KERNEL = ../generic/zsymv_k.c +#ZSYMV_U_KERNEL = ../generic/zsymv_k.c +#ZSYMV_L_KERNEL = ../generic/zsymv_k.c +#XSYMV_U_KERNEL = ../generic/zsymv_k.c +#XSYMV_L_KERNEL = ../generic/zsymv_k.c + +#ZHEMV_U_KERNEL = ../generic/zhemv_k.c +#ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/power/casum.c b/kernel/power/casum.c index d1108581d3..a9ece07685 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "casum_microk_power8.c" #endif diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index ce7d674753..50df84cc50 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "ccopy_microk_power8.c" #endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 40e350ba3f..959a9eda06 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index da97c896e8..31e02fe5a4 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 73962c2f21..d0e060977c 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" #endif diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index df0572e8ee..f09611ff09 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "daxpy_microk_power8.c" #endif diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index 059c0e5a94..27b39144ba 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dcopy_microk_power8.c" #endif diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index e43470e23d..f985df1c5a 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "ddot_microk_power8.c" #endif diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S new file mode 100644 index 0000000000..a1762dcf20 --- /dev/null +++ b/kernel/power/dgemm_kernel_power9.S @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld + + + + +#define STACKSIZE (512 ) +#define ALPHA_SP (296+192)(SP) +#define FZERO (304+192)(SP) + + + +#define M r3 +#define N r4 +#define K r5 + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs18 + +#define o0 0 + + +#define T4 r12 +#define T3 r11 +#define C4 r14 +#define o8 r15 +#define o24 r16 +#define C2 r17 +#define L r18 +#define T1 r19 +#define C3 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_power9.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + stfd f1, ALPHA_SP + stw r0, FZERO + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + + + addi T1, SP, 296+192 + + + li PRE, 384 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + + lxvdsx alpha_r, 0, T1 + +#include "dgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S new file mode 100644 index 0000000000..251839d19e --- /dev/null +++ b/kernel/power/dgemm_logic_power9.S @@ -0,0 +1,1981 @@ +/*************************************************************************** +Copyright (c) 2013-2019 The OpenBLAS 
Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 2 + ble LDGEMM_L4_END + +LDGEMM_L4_BEGIN: + + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN: + + li L, -128 + + + SAVE4x16_REGS + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + + and T1, CO, L + and T2, C2, L + and T3, C3, L + and T4, C4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 + srawi. L, T3, 5 +#else + srawi. L, K, 5 +#endif + + ble LDGEMM_L4x16_SUB0 + + + MY_ALIGN +LDGEMM_L4x16_LOOP_START: + + li T2, 512 + + + LOAD4x16_1 + ##OffsetA=128 OffsetB=32 + addi AO,AO,2176 + # addi BO,BO,32 + addic. 
L, L, -1 + + ble LDGEMM_L4x16_LOOP_END + + + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + #dcbt AO, PRE + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_2 -2048,32, 15,1 + + + bdnz LDGEMM_L4x16_LOOP + + MY_ALIGN + MY_ALIGN +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_3 -2048,32, 15,1 + b LDGEMM_L4x16_SUB1 + + + MY_ALIGN +LDGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + KERNEL4x16 1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 + MY_ALIGN +LDGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + ble LDGEMM_L4x16_SAVE + MY_ALIGN +LDGEMM_L4x16_SUB2: + + andi. T1,L, 16 + ble LDGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_2 128,32, 3,0 + KERNEL4x16_I1_L2_2 128,32, 4,0 + KERNEL4x16_I1_L2_2 128,32, 5,0 + KERNEL4x16_I1_L2_2 128,32, 6,0 + KERNEL4x16_I1_L2_3 128,32, 7,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_8: + andi. T1,L, 8 + ble LDGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_3 128,32, 3,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_3 128,32, 1,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 128,32, 0,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN +LDGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LDGEMM_L4x16_BEGIN + +LDGEMM_L4x16_END: + +LDGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L4x1_END + + andi. T1, M, 8 + ble LDGEMM_L4x8_END + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 + srawi. L, T3, 4 +#else + mr BO, B + srawi. L, K, 4 +#endif + + + ble LDGEMM_L4x8_SUB0 + +LDGEMM_L4x8_LOOP_START: + + + LOAD4x8_1 + ##OffsetA=64 OffsetB=32 + + + addic. 
L, L, -1 + + ble LDGEMM_L4x8_LOOP_END + + mtctr L + MY_ALIGN + +LDGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_2 64,32, 7,1 + + bdnz LDGEMM_L4x8_LOOP + MY_ALIGN +LDGEMM_L4x8_LOOP_END: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_3 64,32, 7,1 + + b LDGEMM_L4x8_SUB1 + MY_ALIGN +LDGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + KERNEL4x8 1 + + addic. L, L, -1 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 + MY_ALIGN +LDGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + ble LDGEMM_L4x8_SAVE + MY_ALIGN +LDGEMM_L4x8_SUB2: + + andi. T1,L, 8 + ble LDGEMM_L4x8_SUB2_4 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_3 64,32, 3,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_3 64,32, 1,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 64,32, 0,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x8_SAVE + KERNEL4x8 0 + + MY_ALIGN +LDGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 +#endif +LDGEMM_L4x8_END: + +LDGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x4_SUB4 + +LDGEMM_L4x4_LOOP_START: + + #dcbt AO, PRE + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -2 + ble LDGEMM_L4x4_LOOP_END + + MY_ALIGN + +LDGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -1 + bgt LDGEMM_L4x4_LOOP + +LDGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x4_SAVE + +LDGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SAVE: + + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 +#endif +LDGEMM_L4x4_END: + +LDGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x2_SUB4 + +LDGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble LDGEMM_L4x2_LOOP_END + + MY_ALIGN + +LDGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt LDGEMM_L4x2_LOOP + +LDGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x2_SAVE + +LDGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SAVE: + + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 +#endif +LDGEMM_L4x2_END: + +LDGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x1_SUB4 + +LDGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble LDGEMM_L4x1_LOOP_END + + MY_ALIGN + +LDGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt LDGEMM_L4x1_LOOP + +LDGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x1_SAVE + +LDGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SAVE: + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 +#endif +LDGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + addic. J, J, -1 + bgt LDGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +LDGEMM_L4_END: + + b LDGEMM_L2_BEGIN + +.L999_H1: + + b .L999 + +LDGEMM_L2_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 2 + ble LDGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble LDGEMM_L2x16_END + +LDGEMM_L2x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x16_SUB4 + +LDGEMM_L2x16_LOOP_START: + + #dcbt AO, PRE + LOAD2x16_1 + #dcbt AO, PRE + KERNEL2x16_I1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble LDGEMM_L2x16_LOOP_END + + MY_ALIGN + +LDGEMM_L2x16_LOOP: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt LDGEMM_L2x16_LOOP + +LDGEMM_L2x16_LOOP_END: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB4: + + #dcbt AO, PRE + KERNEL2x16_SUBI1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x16_SAVE + +LDGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SAVE: + + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt LDGEMM_L2x16_BEGIN + +LDGEMM_L2x16_END: + +LDGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L2x1_END + + andi. T1, M, 8 + ble LDGEMM_L2x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x8_SUB4 + +LDGEMM_L2x8_LOOP_START: + + #dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble LDGEMM_L2x8_LOOP_END + + MY_ALIGN + +LDGEMM_L2x8_LOOP: + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt LDGEMM_L2x8_LOOP + +LDGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x8_SAVE + +LDGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SAVE: + + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 +#endif +LDGEMM_L2x8_END: + +LDGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x4_SUB4 + +LDGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble LDGEMM_L2x4_LOOP_END + + MY_ALIGN + +LDGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt LDGEMM_L2x4_LOOP + +LDGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x4_SAVE + +LDGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SAVE: + + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 +#endif +LDGEMM_L2x4_END: + +LDGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x2_SUB4 + +LDGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble LDGEMM_L2x2_LOOP_END + + MY_ALIGN + +LDGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt LDGEMM_L2x2_LOOP + +LDGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x2_SAVE + +LDGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SAVE: + + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 +#endif +LDGEMM_L2x2_END: + +LDGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x1_SUB4 + +LDGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble LDGEMM_L2x1_LOOP_END + + MY_ALIGN + +LDGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt LDGEMM_L2x1_LOOP + +LDGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x1_SAVE + +LDGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SAVE: + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 +#endif +LDGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 1 + ble LDGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble LDGEMM_L1x16_END + +LDGEMM_L1x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x16_SUB4 + +LDGEMM_L1x16_LOOP_START: + + #dcbt AO, PRE + LOAD1x16_1 + #dcbt AO, PRE + KERNEL1x16_I1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble LDGEMM_L1x16_LOOP_END + + MY_ALIGN + +LDGEMM_L1x16_LOOP: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt LDGEMM_L1x16_LOOP + +LDGEMM_L1x16_LOOP_END: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB4: + + #dcbt AO, PRE + KERNEL1x16_SUBI1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x16_SAVE + +LDGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SAVE: + + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt LDGEMM_L1x16_BEGIN + +LDGEMM_L1x16_END: + +LDGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L1x1_END + + andi. T1, M, 8 + ble LDGEMM_L1x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x8_SUB4 + +LDGEMM_L1x8_LOOP_START: + + #dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble LDGEMM_L1x8_LOOP_END + + MY_ALIGN + +LDGEMM_L1x8_LOOP: + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt LDGEMM_L1x8_LOOP + +LDGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x8_SAVE + +LDGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SAVE: + + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 +#endif +LDGEMM_L1x8_END: + +LDGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x4_SUB4 + +LDGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble LDGEMM_L1x4_LOOP_END + + MY_ALIGN + +LDGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt LDGEMM_L1x4_LOOP + +LDGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x4_SAVE + +LDGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SAVE: + + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 +#endif +LDGEMM_L1x4_END: + +LDGEMM_L1x2_BEGIN: + + + andi. 
T1, M, 2 + ble LDGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x2_SUB4 + +LDGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble LDGEMM_L1x2_LOOP_END + + MY_ALIGN + +LDGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt LDGEMM_L1x2_LOOP + +LDGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x2_SAVE + +LDGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SAVE: + + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 +#endif +LDGEMM_L1x2_END: + +LDGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x1_SUB4 + +LDGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble LDGEMM_L1x1_LOOP_END + + MY_ALIGN + +LDGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt LDGEMM_L1x1_LOOP + +LDGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x1_SAVE + +LDGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SAVE: + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 +#endif +LDGEMM_L1x1_END: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S new file mode 100644 index 0000000000..c4b8270b82 --- /dev/null +++ b/kernel/power/dgemm_macros_power9.S @@ -0,0 +1,3623 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + xxlxor vs36,vs36,vs36 + xxlxor vs37,vs37,vs37 + xxlxor vs38,vs38,vs38 + xxlxor vs39,vs39,vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + + +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro KERNEL4x16_L1_L2 Index,IsLast + 
KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete + +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 +.else + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 +.endif + lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) + lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + +.else + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 +.endif + lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + 
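+/* descriptive note: the guarded loads below refill vs0-vs7 (A) and vs24-vs27 (B)
+   for the next unrolled step while the vs8-vs15 FMAs are still in flight;
+   \Complete==1 skips them on the closing step so nothing is read past the
+   end of the panel. */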
lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) +.endif + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) + lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 +.if \Complete==0 + lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) +.endif + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) +.endif + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 +.if \Complete==0 + lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) +.endif + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + + xvmaddadp vs60, vs12, vs31 + + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + + xvmaddadp vs63, vs15, vs31 + .if \IsLast==1 + .if \Complete==1 + addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) + .else + addi \AREG, \AREG, DISP32(\Index,256) + addi \BREG, \BREG, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x16 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) + + + + addi BO, BO, 32 + addi AO, AO, 128 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, 
vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endif +.endm + +.macro SAVE4x16_REGS + add C2, CO, LDC + add C3, C2, LDC + add C4, C3, LDC +.endm + +.macro SAVE4x16 +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs24, 64(CO) + lxv vs26, 80(CO) + lxv vs28, 96(CO) + lxv vs30, 112(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C2) + lxv vs3, 16(C2) + lxv vs5, 32(C2) + lxv vs7, 48(C2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs25, 64(C2) + lxv vs27, 80(C2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 +#ifndef TRMMKERNEL + lxv vs29, 96(C2) + lxv vs31, 112(C2) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + xxpermdi vs8, vs44,vs36,1 + xxpermdi vs9 ,vs36,vs44,1 + xxpermdi vs10, vs45,vs37,1 + xxpermdi vs11 ,vs37,vs45,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + xxpermdi vs12, vs46,vs38,1 + xxpermdi vs13 ,vs38,vs46,1 + xxpermdi vs14, vs47,vs39,1 + xxpermdi vs15 ,vs39,vs47,1 + +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r + +#endif + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + stxv vs24, 64(CO) + stxv vs26, 80(CO) + stxv vs28, 96(CO) + stxv vs30, 112(CO) + + stxv vs1, 0(C2) + stxv vs3, 16(C2) + stxv vs5, 32(C2) + stxv vs7, 48(C2) + + stxv vs25, 64(C2) + stxv vs27, 80(C2) + stxv vs29, 96(C2) + stxv vs31, 112(C2) +#ifndef TRMMKERNEL + lxv vs0, 0(C3) + lxv vs2, 16(C3) + lxv vs4, 32(C3) + lxv vs6, 48(C3) +#endif + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs24, 64(C3) + lxv vs26, 80(C3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs28, 96(C3) + lxv vs30, 112(C3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C4) + lxv vs3, 16(C4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(C4) + lxv vs7, 48(C4) + + lxv vs25, 64(C4) + lxv vs27, 80(C4) + lxv vs29, 96(C4) + lxv vs31, 112(C4) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r 
+ xvmuldp vs3, vs11, alpha_r + +#endif + + xxpermdi vs8, vs60,vs52,1 + xxpermdi vs9 ,vs52,vs60,1 + xxpermdi vs10, vs61,vs53,1 + xxpermdi vs11 ,vs53,vs61,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + + + xxpermdi vs12, vs62,vs54,1 + xxpermdi vs13 ,vs54,vs62,1 + xxpermdi vs14, vs63,vs55,1 + xxpermdi vs15 ,vs55,vs63,1 +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r +#endif + stxv vs0, 0(C3) + stxv vs2, 16(C3) + stxv vs4, 32(C3) + stxv vs6, 48(C3) + + stxv vs24, 64(C3) + stxv vs26, 80(C3) + stxv vs28, 96(C3) + stxv vs30, 112(C3) + + stxv vs1, 0(C4) + stxv vs3, 16(C4) + stxv vs5, 32(C4) + stxv vs7, 48(C4) + + stxv vs25, 64(C4) + stxv vs27, 80(C4) + stxv vs29, 96(C4) + stxv vs31, 112(C4) + + addi CO, CO, 128 +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + +.endif +.endm + + + +.macro KERNEL4x8_L1_L2 Index,IsLast + KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index,0+\OffsetA)(AO) + lxv vs9, DISP16(\Index,16+\OffsetA)(AO) +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + + lxv vs10, DISP16(\Index,32+\OffsetA)(AO) + lxv vs11, DISP16(\Index,48+\OffsetA)(AO) + + + +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + + lxv vs28, DISP8(\Index,0 
+\OffsetB)(BO) + lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(AO) + lxv vs1, DISP16(\Index,80+\OffsetA)(AO) +.endif + + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.if \Complete==0 + lxv vs2, DISP16(\Index,96+\OffsetA)(AO) + lxv vs3, DISP16(\Index,112+\OffsetA)(AO) +.endif + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) + lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) +.endif + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + + .if \IsLast==1 + .if \Complete==1 + addi AO, AO, DISP16(\Index,64+\OffsetA) + addi BO, BO, DISP8(\Index,32+\OffsetB) + .else + addi AO, AO, DISP16(\Index,128) + addi BO, BO, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x8 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + + + addi BO, BO, 32 + addi AO, AO, 64 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + +.endif +.endm + + + +.macro SAVE4x8 + add T2, CO, LDC + add T3, T2, LDC + add T4, T3, LDC +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T2) + lxv vs3, 16(T2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T2) + lxv vs7, 48(T2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 + + + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + 
xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + + stxv vs1, 0(T2) + stxv vs3, 16(T2) + stxv vs5, 32(T2) + stxv vs7, 48(T2) + + + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs0, 0(T3) + lxv vs2, 16(T3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs4, 32(T3) + lxv vs6, 48(T3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T4) + lxv vs3, 16(T4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T4) + lxv vs7, 48(T4) + + + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(T3) + stxv vs2, 16(T3) + stxv vs4, 32(T3) + stxv vs6, 48(T3) + + + stxv vs1, 0(T4) + stxv vs3, 16(T4) + stxv vs5, 32(T4) + stxv vs7, 48(T4) + + + + addi CO, CO, 64 +.endm + + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, 
vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, 
vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx 
vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 
+ xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + 
xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=8 *
+*********************************************************************/
+
+.macro LOAD2x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro SAVE2x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
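+/* descriptive note: as in the wider SAVE macros, the block above forms
+   C = alpha*A*B + C in the GEMM path and C = alpha*A*B for TRMM (where C is
+   write-only); the stores below commit the first row block before T1 is
+   advanced by LDC for the second. */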
+ stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + 
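+/* descriptive note: the _I1 and _SUBI1 variants use xvmuldp so the very first
+   K iteration initializes the accumulators by the multiply itself, avoiding
+   the explicit xxlxor zeroing pass used by the LOAD..._0 paths. */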
xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, 
o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + 
xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=8 *
+*********************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=4 *
+*********************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro
KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + 
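+/* descriptive note: scalar M=1 epilogue; a single xsmaddadp (or xsmuldp when
+   TRMMKERNEL is defined) has already applied alpha, and the lone store below
+   writes the result back to C. */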
stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index 57f9f9e728..b458e11fcb 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dgemv_n_microk_power8.c" #endif diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 3e107486f6..baeb542051 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" #endif diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index f32dc4bad6..779a08e9ce 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" #endif diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index fd2dec9c49..52b7f50dad 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" #endif diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index fb10b1d27e..5908347d3d 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" #endif diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c index 167c29babc..5e3fe45a57 100644 --- a/kernel/power/scopy.c +++ b/kernel/power/scopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "scopy_microk_power8.c" #endif diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c index 4fdc2f5b55..ae527dde9d 100644 --- a/kernel/power/sdot.c +++ b/kernel/power/sdot.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sdot_microk_power8.c" #endif diff --git a/kernel/power/srot.c b/kernel/power/srot.c index d2910ff875..6af813c161 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" #endif diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index bd5cdc43fe..4f3ba56980 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" #endif diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 932652b376..23d13280fb 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" #endif diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index 0b6b87d46d..f61c62e75b 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zasum_microk_power8.c" #endif diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index dd7ab6c3cc..f0f8c69108 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -36,19 +36,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zaxpy_microk_power8.c" #endif #ifndef HAVE_KERNEL_4 -static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { BLASLONG register i = 0; BLASLONG register ix = 0; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; + while(i < n) diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index a7658f7ab9..b21d6ef15c 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zcopy_microk_power8.c" #endif diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index b83f832b13..fd36c7f448 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zdot_microk_power8.c" #endif diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 14d677f249..a1b441d2c8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 5ec1eee2e1..1d8826f414 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" #endif diff --git a/param.h b/param.h index fa6730208d..938a82a9e4 100644 --- a/param.h +++ b/param.h @@ -2230,6 +2230,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#if defined(POWER9) + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 65536 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 1280 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 640 +#define ZGEMM_DEFAULT_P 320 + +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 640 +#define ZGEMM_DEFAULT_Q 640 + +#define SYMV_P 8 + +#endif #if defined(SPARC) && defined(V7) From 4f9d3e4b28e9a5dfbe70e0a4f4f54517e5b3d6ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 12:37:13 +0100 Subject: [PATCH 158/189] Expose CBLAS interfaces for I?MIN and I?MAX --- cblas.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cblas.h b/cblas.h index d340a20371..e3dacb737f 100644 --- a/cblas.h +++ b/cblas.h @@ -88,6 +88,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + +CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); From 3d1e36d4cb15eb94098d2ab0a3413413c7aec2c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 12:38:41 +0100 Subject: [PATCH 159/189] Build CBLAS interfaces for I?MIN and I?MAX --- interface/Makefile | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 20ec74e9ee..2b996c7de0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -263,7 +263,8 @@ CSBLAS1OBJS = \ cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ - cblas_sscal.$(SUFFIX) 
cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) + cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -280,7 +281,8 @@ CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ - cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) + cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -300,7 +302,8 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) + cblas_caxpby.$(SUFFIX) \ + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ @@ -326,7 +329,9 @@ CZBLAS1OBJS = \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) + cblas_zaxpby.$(SUFFIX) \ + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) + CZBLAS2OBJS = \ cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ @@ -1383,6 +1388,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) From 5c42287c4fa88e295a5c0bc9b58e3915148408be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 21:58:03 +0100 Subject: [PATCH 160/189] Add declarations for ?sum and cblas_?sum --- cblas.h | 5 +++++ common_c.h | 2 ++ common_d.h | 2 ++ common_interface.h | 7 +++++++ common_level1.h | 7 +++++++ common_macro.h | 6 ++++++ common_param.h | 6 ++++++ common_s.h | 2 ++ common_z.h | 2 ++ 9 files changed, 39 insertions(+) diff --git a/cblas.h b/cblas.h index e3dacb737f..1a87074d6b 100644 --- a/cblas.h +++ b/cblas.h @@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double 
cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); diff --git a/common_c.h b/common_c.h index ce0f2a5bdb..40ecf5b8b8 100644 --- a/common_c.h +++ b/common_c.h @@ -19,6 +19,7 @@ #define CDOTC_K cdotc_k #define CNRM2_K cnrm2_k #define CSCAL_K cscal_k +#define CSUM_K csum_k #define CSWAP_K cswap_k #define CROT_K csrot_k @@ -249,6 +250,7 @@ #define CDOTC_K gotoblas -> cdotc_k #define CNRM2_K gotoblas -> cnrm2_k #define CSCAL_K gotoblas -> cscal_k +#define CSUM_K gotoblas -> csum_k #define CSWAP_K gotoblas -> cswap_k #define CROT_K gotoblas -> csrot_k diff --git a/common_d.h b/common_d.h index ad99451867..94dc3eea88 100644 --- a/common_d.h +++ b/common_d.h @@ -19,6 +19,7 @@ #define DDOTC_K ddot_k #define DNRM2_K dnrm2_k #define DSCAL_K dscal_k +#define DSUM_K dsum_k #define DSWAP_K dswap_k #define DROT_K drot_k @@ -174,6 +175,7 @@ #define DDOTC_K gotoblas -> ddot_k #define DNRM2_K gotoblas -> dnrm2_k #define DSCAL_K gotoblas -> dscal_k +#define DSUM_K gotoblas -> dsum_k #define DSWAP_K gotoblas -> dswap_k #define DROT_K gotoblas -> drot_k diff --git a/common_interface.h b/common_interface.h index 15f69e02f1..c350ac8ec0 100644 --- a/common_interface.h +++ b/common_interface.h @@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); double BLASFUNC(dzasum)(blasint *, double *, blasint *); xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *); +FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *); +double BLASFUNC(dsum) (blasint *, double *, blasint *); +xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *); +double BLASFUNC(dzsum)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *); + blasint BLASFUNC(isamax)(blasint *, float *, blasint *); blasint BLASFUNC(idamax)(blasint *, double *, blasint *); blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); diff --git a/common_level1.h b/common_level1.h index 32ffd6f188..74cafb6dbb 100644 --- a/common_level1.h +++ b/common_level1.h @@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG); double zasum_k (BLASLONG, double *, BLASLONG); xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); +float ssum_k (BLASLONG, float *, BLASLONG); +double dsum_k (BLASLONG, double *, BLASLONG); +xdouble qsum_k (BLASLONG, xdouble *, BLASLONG); +float csum_k (BLASLONG, float *, BLASLONG); +double zsum_k (BLASLONG, double *, BLASLONG); +xdouble xsum_k (BLASLONG, xdouble *, BLASLONG); + float samax_k (BLASLONG, float *, BLASLONG); double damax_k (BLASLONG, double *, BLASLONG); xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 15ba6f9db9..d2503aa65e 100644 --- a/common_macro.h +++ b/common_macro.h @@ -66,6 +66,7 @@ #define DOTC_K QDOTC_K #define NRM2_K QNRM2_K #define SCAL_K QSCAL_K +#define SUM_K QSUM_K #define SWAP_K QSWAP_K #define ROT_K QROT_K @@ -356,6 +357,7 @@ #define DOTC_K DDOTC_K #define NRM2_K DNRM2_K #define SCAL_K DSCAL_K +#define SUM_K DSUM_K #define SWAP_K DSWAP_K #define ROT_K DROT_K @@ -658,6 +660,7 @@ #define DOTC_K SDOTC_K #define NRM2_K SNRM2_K #define SCAL_K SSCAL_K +#define SUM_K SSUM_K #define SWAP_K SSWAP_K #define ROT_K SROT_K @@ 
-962,6 +965,7 @@ #define DOTC_K XDOTC_K #define NRM2_K XNRM2_K #define SCAL_K XSCAL_K +#define SUM_K XSUM_K #define SWAP_K XSWAP_K #define ROT_K XROT_K @@ -1363,6 +1367,7 @@ #define DOTC_K ZDOTC_K #define NRM2_K ZNRM2_K #define SCAL_K ZSCAL_K +#define SUM_K ZSUM_K #define SWAP_K ZSWAP_K #define ROT_K ZROT_K @@ -1785,6 +1790,7 @@ #define DOTC_K CDOTC_K #define NRM2_K CNRM2_K #define SCAL_K CSCAL_K +#define SUM_K CSUM_K #define SWAP_K CSWAP_K #define ROT_K CROT_K diff --git a/common_param.h b/common_param.h index 8f162c01f5..574d5e176d 100644 --- a/common_param.h +++ b/common_param.h @@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); + float (*ssum_k) (BLASLONG, float *, BLASLONG); int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); + double (*dsum_k) (BLASLONG, double *, BLASLONG); int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); @@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); @@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); + float (*csum_k) (BLASLONG, float *, BLASLONG); int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); double (*znrm2_k) (BLASLONG, double *, BLASLONG); double (*zasum_k) (BLASLONG, double *, BLASLONG); + double (*zsum_k) (BLASLONG, double *, BLASLONG); int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); diff --git a/common_s.h b/common_s.h index 3c1600859e..23c432f7c5 100644 --- a/common_s.h +++ b/common_s.h @@ -12,6 +12,7 @@ #define ISMAX_K ismax_k #define ISMIN_K ismin_k #define SASUM_K sasum_k +#define SSUM_K ssum_k #define SAXPYU_K saxpy_k #define SAXPYC_K 
saxpy_k #define SCOPY_K scopy_k @@ -170,6 +171,7 @@ #define ISMAX_K gotoblas -> ismax_k #define ISMIN_K gotoblas -> ismin_k #define SASUM_K gotoblas -> sasum_k +#define SSUM_K gotoblas -> ssum_k #define SAXPYU_K gotoblas -> saxpy_k #define SAXPYC_K gotoblas -> saxpy_k #define SCOPY_K gotoblas -> scopy_k diff --git a/common_z.h b/common_z.h index b4f58bb0c8..f1e78dd088 100644 --- a/common_z.h +++ b/common_z.h @@ -19,6 +19,7 @@ #define ZDOTC_K zdotc_k #define ZNRM2_K znrm2_k #define ZSCAL_K zscal_k +#define ZSUM_K zsum_k #define ZSWAP_K zswap_k #define ZROT_K zdrot_k @@ -249,6 +250,7 @@ #define ZDOTC_K gotoblas -> zdotc_k #define ZNRM2_K gotoblas -> znrm2_k #define ZSCAL_K gotoblas -> zscal_k +#define ZSUM_K gotoblas -> zsum_k #define ZSWAP_K gotoblas -> zswap_k #define ZROT_K gotoblas -> zdrot_k From 79cfc24a6208b869ec79ee26d3f3eab6af3b8aea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 21:59:18 +0100 Subject: [PATCH 161/189] Add interface for ?sum (derived from ?asum) --- interface/CMakeLists.txt | 3 ++ interface/Makefile | 56 +++++++++++++++++------ interface/sum.c | 97 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 13 deletions(-) create mode 100644 interface/sum.c diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 8b25344c01..f76d5c13f7 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES rotm.c rotmg.c # N.B. these do not have complex counterparts rot.c asum.c + sum.c ) # these will have 'z' prepended for the complex version @@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") endif () if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") @@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") endif () endforeach () diff --git a/interface/Makefile b/interface/Makefile index 2b996c7de0..f0577796d5 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -25,7 +25,7 @@ SBLAS1OBJS = \ saxpy.$(SUFFIX) sswap.$(SUFFIX) \ scopy.$(SUFFIX) sscal.$(SUFFIX) \ sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ - sasum.$(SUFFIX) snrm2.$(SUFFIX) \ + sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ @@ -51,7 +51,7 @@ DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ dcopy.$(SUFFIX) dscal.$(SUFFIX) \ ddot.$(SUFFIX) \ - dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ + dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ @@ -76,7 +76,7 @@ CBLAS1OBJS = \ caxpy.$(SUFFIX) 
caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ - scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ + scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ scamax.$(SUFFIX) icamax.$(SUFFIX) \ scamin.$(SUFFIX) icamin.$(SUFFIX) \ csrot.$(SUFFIX) crotg.$(SUFFIX) \ @@ -105,7 +105,7 @@ ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ - dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ + dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ dzamax.$(SUFFIX) izamax.$(SUFFIX) \ dzamin.$(SUFFIX) izamin.$(SUFFIX) \ zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ @@ -146,7 +146,7 @@ QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ qdot.$(SUFFIX) \ - qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ @@ -168,7 +168,7 @@ XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ - qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ @@ -203,7 +203,7 @@ ifdef QUAD_PRECISION QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ - qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ @@ -224,7 +224,7 @@ QBLAS3OBJS = \ XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ - qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ @@ -264,7 +264,7 @@ CSBLAS1OBJS = \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -282,7 +282,7 @@ CDBLAS1OBJS = \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -303,7 +303,7 @@ CCBLAS1OBJS = \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ cblas_caxpby.$(SUFFIX) \ - cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) 
cblas_cgeru.$(SUFFIX) \ @@ -330,7 +330,7 @@ CZBLAS1OBJS = \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ cblas_zaxpby.$(SUFFIX) \ - cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) CZBLAS2OBJS = \ @@ -565,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) +ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) @@ -1412,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1419,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) diff --git a/interface/sum.c b/interface/sum.c new file mode 100644 index 0000000000..dfdcc5dcc5 --- /dev/null +++ b/interface/sum.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (FLOATRET)SUM_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, n); + + IDEBUG_END; + + return ret; +} + +#else +#ifdef COMPLEX +FLOAT CNAME(blasint n, void *vx, blasint incx){ + FLOAT *x = (FLOAT*) vx; +#else +FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ +#endif + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = SUM_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, n); + + IDEBUG_END; + + return ret; +} + +#endif From b9f4943a14ef8ff4a1bde192f491b2efa02eff40 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:01:13 +0100 Subject: [PATCH 162/189] Add ?sum --- kernel/CMakeLists.txt | 1 + kernel/Makefile.L1 | 61 ++++++++++++++++++++++++++++++++++++++----- kernel/setparam-ref.c | 12 ++++----- 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 2a330df4e9..ad15b8f250 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index a8f9cf0974..970703230a 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -340,6 +340,32 @@ ifndef XSCALKERNEL XSCALKERNEL = zscal.S endif +### SUM ### + +ifndef SSUMKERNEL +SSUMKERNEL = sum.S +endif + +ifndef DSUMKERNEL +DSUMKERNEL = sum.S +endif + +ifndef CSUMKERNEL +CSUMKERNEL = zsum.S +endif + +ifndef ZSUMKERNEL +ZSUMKERNEL = zsum.S +endif + +ifndef QSUMKERNEL +QSUMKERNEL = sum.S +endif + +ifndef XSUMKERNEL +XSUMKERNEL = zsum.S +endif + ### SWAP ### ifndef SSWAPKERNEL @@ -453,7 +479,7 @@ endif SBLASOBJS += \
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ - sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ + sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ saxpby_k$(TSUFFIX).$(SUFFIX) @@ -463,31 +489,32 @@ DBLASOBJS += \ idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ - daxpby_k$(TSUFFIX).$(SUFFIX) + daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) QBLASOBJS += \ qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ - qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) + qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ + qsum_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ - cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) + cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ - zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) + zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX) XBLASOBJS += \ xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ - xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) + xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) ### AMAX ### @@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : 
$(KERNELDIR)/$(IQMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ - +### ASUM ### $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ @@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ +### SUM ### +$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ + +### AXPY ### $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 6d4028b0b2..2985003f35 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = { samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, sgemv_nTS, sgemv_tTS, sger_kTS, @@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = { damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, - dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS, + dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS, drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, dgemv_nTS, dgemv_tTS, dger_kTS, dsymv_LTS, dsymv_UTS, @@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = { qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, - qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS, + qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, @@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = { #endif camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, - cnrm2_kTS, casum_kTS, ccopy_kTS, + cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS, cdotu_kTS, cdotc_kTS, csrot_kTS, caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, @@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = { #endif zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, - znrm2_kTS, zasum_kTS, zcopy_kTS, + znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS, zdotu_kTS, zdotc_kTS, zdrot_kTS, zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, @@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = { XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, - xnrm2_kTS, xasum_kTS, xcopy_kTS, + xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS, xdotu_kTS, xdotc_kTS, xqrot_kTS, 
xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, From c3cfc6986b9b2b38af7324591dd4a54c21a093a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:05:11 +0100 Subject: [PATCH 163/189] Add implementations of ssum/dsum and csum/zsum as trivial copies of asum/zasum with the fabs calls replaced by fmov to preserve code structure --- kernel/alpha/sum.S | 206 +++++++++++++++++++++++++++++++++++++++++++ kernel/alpha/zsum.S | 208 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 414 insertions(+) create mode 100644 kernel/alpha/sum.S create mode 100644 kernel/alpha/zsum.S diff --git a/kernel/alpha/sum.S b/kernel/alpha/sum.S new file mode 100644 index 0000000000..3902817a70 --- /dev/null +++ b/kernel/alpha/sum.S @@ -0,0 +1,206 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + ble N, $L999 + + sra N, 3, I + fclr s1 + fclr s2 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t1 + SXADDQ INCX, X, X + fclr t2 + + LD a1, 0 * SIZE(X) + fclr t3 + SXADDQ INCX, X, X + fclr s3 + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * 2 * SIZE(X) + fmov a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a7, 0 * SIZE(X) + fmov a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fmov a3, t3 + SXADDQ INCX, X, X + + ADD s0, t0, s0 + LD a1, 0 * SIZE(X) + fmov a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fmov a5, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a3, 0 * SIZE(X) + fmov a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fmov a7, t3 + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fmov a0, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a7, 0 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fmov a2, t2 + ADD s3, t3, s3 + fmov a3, t3 + + ADD s0, t0, s0 + fmov a4, t0 + ADD s1, t1, s1 + fmov a5, t1 + ADD s2, t2, s2 + fmov a6, t2 + ADD s3, t3, s3 + fmov a7, t3 + + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s1, s0 + ADD s2, s3, s2 + .align 4 + +$L15: + and N, 7, I + ADD s0, s2, s0 + unop + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fmov a0, t0 + + lda I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ret + EPILOGUE diff --git a/kernel/alpha/zsum.S b/kernel/alpha/zsum.S new file mode 100644 index 0000000000..1ad0eb137d --- /dev/null +++ b/kernel/alpha/zsum.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + addq INCX, INCX, INCX + + fclr s1 + unop + fclr t1 + ble N, $L999 + + fclr s2 + sra N, 2, I + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a2, 0 * SIZE(X) + fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + lda I, -1(I) + + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * SIZE(X) + fmov a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fmov a1, t1 + unop + + ADD s2, t2, s2 + LD a7, 1 * SIZE(X) + fmov a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fmov a3, t3 + unop + + ADD s0, t0, s0 + LD a1, 1 * SIZE(X) + fmov a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fmov a5, t1 + unop + + ADD s2, t2, s2 + LD a3, 1 * SIZE(X) + fmov a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fmov a7, t3 + unop + + LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fmov a0, t0 + + ADD s1, t1, s1 + LD a7, 1 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fmov a2, t2 + ADD s3, t3, s3 + fmov a3, t3 + + ADD s0, t0, s0 + fmov a4, t0 + ADD s1, t1, s1 + fmov a5, t1 + ADD s2, t2, s2 + fmov a6, t2 + ADD s3, t3, s3 + fmov a7, t3 + + ADD s2, t2, s2 + ADD s3, t3, s3 + + .align 4 + +$L15: + ADD s0, s2, s0 + and N, 3, I + ADD s1, s3, s1 + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + fmov a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a1, 1 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ADD s1, t1, s1 + + ADD s0, s1, s0 + ret + EPILOGUE From 94ab4e6fb262a03752cb1a54a5731cb8b0b2dc43 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:11:38 +0100 Subject: [PATCH 164/189] Add ARM implementations of ?sum (trivial copies of the respective ?asum with the fabs calls removed) --- kernel/arm/KERNEL.ARMV5 | 5 + kernel/arm/KERNEL.ARMV6 | 3 + kernel/arm/sum.c | 51 +++++ kernel/arm/sum_vfp.S | 425 ++++++++++++++++++++++++++++++++++++++++ 
kernel/arm/zsum.c | 57 ++++++ 5 files changed, 541 insertions(+) create mode 100644 kernel/arm/sum.c create mode 100644 kernel/arm/sum_vfp.S create mode 100644 kernel/arm/zsum.c diff --git a/kernel/arm/KERNEL.ARMV5 b/kernel/arm/KERNEL.ARMV5 index 10808e2d93..e977dda3a0 100644 --- a/kernel/arm/KERNEL.ARMV5 +++ b/kernel/arm/KERNEL.ARMV5 @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 960dae67b0..b773a5ba03 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S +SSUMKERNEL = sum_vfp.S +DSUMKERNEL = sum_vfp.S + SAXPYKERNEL = axpy_vfp.S DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c new file mode 100644 index 0000000000..7b78ec61a4 --- /dev/null +++ b/kernel/arm/sum.c @@ -0,0 +1,51 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* trivial copy of asum.c with the ABS() removed * +**************************************************************************************/ + + +#include "common.h" +#include <math.h> + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += x[i]; + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/arm/sum_vfp.S b/kernel/arm/sum_vfp.S new file mode 100644 index 0000000000..d33d99ed3e --- /dev/null +++ b/kernel/arm/sum_vfp.S @@ -0,0 +1,425 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +/************************************************************************************** +* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed * +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + vldmia.f64 X!, { d4 - d5 } + vadd.f64 d0 , d0, d4 + vldmia.f64 X!, { d6 - d7 } + vadd.f64 d1 , d1, d5 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + vldmia.f32 X!, { s4 - s5 } + vadd.f32 s0 , s0, s4 + vldmia.f32 X!, { s6 - s7 } + vadd.f32 s1 , s1, s5 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + vldmia.f64 X!, { d4 - d5 } + vadd.f64 d0 , d0, d4 + vldmia.f64 X!, { d6 - d7 } + vadd.f64 d1 , d1, d5 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + pld [ X, #X_PRE ] + vldmia.f64 X!, { d4 - d5 } + vadd.f64 d0 , d0, d4 + vldmia.f64 X!, { d6 - d7 } + vadd.f64 d1 , d1, d5 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + vadd.f64 d0 , d0, d4 + + vldmia.f64 X!, { d4 } + vadd.f64 d0 , d0, d4 + + +.endm + + +.macro KERNEL_S4 + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + vldmia.f32 X!, { s4 - s5 } + vadd.f32 s0 , s0, s4 + vldmia.f32 X!, { s6 - s7 } + vadd.f32 s1 , s1, s5 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + vldmia.f32 X!, { s4 - s5 } + vadd.f32 s0 , s0, s4 + vldmia.f32 X!, { s6 - s7 } + vadd.f32 s1 , s1, s5 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + 
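+ @ plain running sum: accumulate the loaded element into s0 (the asum original takes the absolute value first)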
vadd.f32 s0 , s0, s4 + + vldmia.f32 X!, { s4 } + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + movs r12, #0 // clear floating point register + vmov s0, r12 + vmov s1, r12 +#if defined(DOUBLE) + vcvt.f64.f32 d0, s0 + vcvt.f64.f32 d1, s1 +#endif + + cmp N, #0 + ble asum_kernel_L999 + + cmp INC_X, #0 + beq asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + + +asum_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_F1 + + .align 5 + +asum_kernel_F4: + +#if !defined(DOUBLE) && !defined(COMPLEX) + pld [ X, #X_PRE ] +#endif + KERNEL_F4 + + subs I, I, #1 + ble asum_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne asum_kernel_F4 + +asum_kernel_F1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + + b asum_kernel_L999 + +asum_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_S1 + + .align 5 + +asum_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + +asum_kernel_L999: + + +#if defined(DOUBLE) + vadd.f64 d0 , d0, d1 // set return value +#else + vadd.f32 s0 , s0, s1 // set return value +#endif + +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif +#endif + + bx lr + + EPILOGUE + diff --git a/kernel/arm/zsum.c b/kernel/arm/zsum.c new file mode 100644 index 0000000000..cd24f99957 --- /dev/null +++ b/kernel/arm/zsum.c @@ -0,0 +1,57 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
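The VFP kernel above, like every sum kernel in this series, dispatches on the stride: an unrolled KERNEL_F4/KERNEL_F1 fast path when INC_X is 1, and a KERNEL_S4/KERNEL_S1 path that advances by INC_X otherwise. A minimal C model of that control flow, a sketch only (the function name sum_ref is invented for illustration):

float sum_ref(long n, const float *x, long inc_x)
{
    float s = 0.0f;
    if (n <= 0 || inc_x <= 0) return s;   // early exit as in the C fallback kernels
    long i = 0;
    if (inc_x == 1) {                     // F path: contiguous data, unrolled by 4
        for (; i + 4 <= n; i += 4)
            s += x[i] + x[i + 1] + x[i + 2] + x[i + 3];
        for (; i < n; i++)                // F1 tail: remaining 0-3 elements
            s += x[i];
    } else {                              // S path: arbitrary stride
        for (; i < n; i++)
            s += x[i * inc_x];
    }
    return s;
}

The assembly versions additionally keep two partial sums (d0/d1 or s0/s1) in the unrolled path to shorten the dependency chain, combining them only once at the end.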
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* trivial copy of zasum.c with the ABS() removed * +**************************************************************************************/ + + +#include "common.h" +#include <math.h> + +#define CSUM1(x,i) x[i]+x[i+1] + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CSUM1(x,i); + i += inc_x2; + } + return(sumf); +} + + From 3e3ccb90118e58a92e59a5e7e3cf3209b3d925f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:13:36 +0100 Subject: [PATCH 165/189] Add ARM64 implementations of ?sum as trivial copies of the respective ?asum kernels with the fabs calls removed --- kernel/arm64/csum.S | 164 ++++++++++++++++++++++++++++++++++++++ kernel/arm64/sum.S | 186 ++++++++++++++++++++++++++++++++++++++++++++ kernel/arm64/zsum.S | 158 +++++++++++++++++++++++++++++++++++++ 3 files changed, 508 insertions(+) create mode 100644 kernel/arm64/csum.S create mode 100644 kernel/arm64/sum.S create mode 100644 kernel/arm64/zsum.S diff --git a/kernel/arm64/csum.S b/kernel/arm64/csum.S new file mode 100644 index 0000000000..90746bc392 --- /dev/null +++ b/kernel/arm64/csum.S @@ -0,0 +1,164 @@ +/******************************************************************************* +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 + +/******************************************************************************/ + +.macro KERNEL_F1 + ld1 {v1.2s}, [X], #8 + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 + ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] + add X, X, #64 + + PRFM PLDL1KEEP, [X, #1024] + + fadd v1.4s, v1.4s, v2.4s + fadd v3.4s, v3.4s, v4.4s + fadd v0.4s, v0.4s, v1.4s + fadd v0.4s, v0.4s, v3.4s +.endm + +.macro KERNEL_F8_FINALIZE + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +.endm + +.macro INIT_S + lsl INC_X, INC_X, #3 +.endm + +.macro KERNEL_S1 + ld1 {v1.2s}, [X], INC_X + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 + fmov s1, SUMF + + cmp N, xzr + ble .Lcsum_kernel_L999 + cmp INC_X, xzr + ble .Lcsum_kernel_L999 + + cmp INC_X, #1 + bne .Lcsum_kernel_S_BEGIN + +.Lcsum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq .Lcsum_kernel_F1 + +.Lcsum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne .Lcsum_kernel_F8 + + KERNEL_F8_FINALIZE + +.Lcsum_kernel_F1: + + ands I, N, #7 + ble .Lcsum_kernel_L999 + +.Lcsum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne .Lcsum_kernel_F10 + +.Lcsum_kernel_L999: + ret + +.Lcsum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble .Lcsum_kernel_S1 + +.Lcsum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne .Lcsum_kernel_S4 + +.Lcsum_kernel_S1: + + ands I, N, #3 + ble .Lcsum_kernel_L999 + +.Lcsum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne .Lcsum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/sum.S b/kernel/arm64/sum.S new file mode 100644 index 0000000000..16d0dc4e44 --- /dev/null +++ b/kernel/arm64/sum.S @@ -0,0 +1,186 @@ +/******************************************************************************* +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define REG0 xzr +#define SUMF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0] + fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0] + fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0] + PRFM PLDL1KEEP, [X, #1024] +#else // DOUBLE + ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] + add X, X, #64 + + PRFM PLDL1KEEP, [X, #1024] + + fadd v2.2d, v2.2d, v3.2d + fadd v4.2d, v4.2d, v5.2d + fadd v0.2d, v0.2d, v2.2d + fadd v0.2d, v0.2d, v4.2d +#endif +.endm + +.macro KERNEL_F8_FINALIZE +#if !defined(DOUBLE) + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +#else + faddp SUMF, v0.2d +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 +#else + lsl INC_X, INC_X, #3 +#endif +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + fadd SUMF, SUMF, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 +#if !defined(DOUBLE) + fmov s1, SUMF +#else + fmov d1, SUMF +#endif + + cmp N, xzr + ble .Lsum_kernel_L999 + cmp INC_X, xzr + ble .Lsum_kernel_L999 + + cmp INC_X, #1 + bne .Lsum_kernel_S_BEGIN + +.Lsum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq .Lsum_kernel_F1 + +.Lsum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne .Lsum_kernel_F8 + + KERNEL_F8_FINALIZE + +.Lsum_kernel_F1: + + ands I, N, #7 + ble .Lsum_kernel_L999 + +.Lsum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne .Lsum_kernel_F10 + +.Lsum_kernel_L999: + 
ret + +.Lsum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble .Lsum_kernel_S1 + +.Lsum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne .Lsum_kernel_S4 + +.Lsum_kernel_S1: + + ands I, N, #3 + ble .Lsum_kernel_L999 + +.Lsum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne .Lsum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/zsum.S b/kernel/arm64/zsum.S new file mode 100644 index 0000000000..67ea3cb4d0 --- /dev/null +++ b/kernel/arm64/zsum.S @@ -0,0 +1,158 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
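Each of these ?sum kernels computes the plain signed sum that the corresponding ?asum kernel computes in absolute value; for the complex variants both the real and the imaginary part of every element contribute. A minimal C model of the double-complex case, mirroring the generic kernel/arm/zsum.c added earlier in this series (the standalone name zsum_ref is illustrative, not part of the patch):

double zsum_ref(long n, const double *x, long inc_x)
{
    // x holds n complex values as interleaved (re, im) pairs,
    // consecutive elements spaced inc_x complex entries apart
    double sum = 0.0;
    if (n <= 0 || inc_x <= 0) return sum;
    for (long k = 0; k < n; k++) {
        sum += x[2 * k * inc_x];          // real part
        sum += x[2 * k * inc_x + 1];      // imaginary part
    }
    return sum;
}

The NEON kernel below reaches the same result with ld1 vector loads and a final faddp horizontal add instead of the scalar loop.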
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define REG0 xzr +#define SUMF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 + +/******************************************************************************/ + +.macro KERNEL_F1 + ld1 {v1.2d}, [X], #16 + faddp TMPF, v1.2d + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F4 + ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 + + fadd v1.2d, v1.2d, v2.2d + fadd v3.2d, v3.2d, v4.2d + + fadd v0.2d, v0.2d, v1.2d + fadd v0.2d, v0.2d, v3.2d + + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F4_FINALIZE + faddp SUMF, v0.2d +.endm + +.macro INIT_S + lsl INC_X, INC_X, #4 +.endm + +.macro KERNEL_S1 + ld1 {v1.2d}, [X], INC_X + faddp TMPF, v1.2d + fadd SUMF, SUMF, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 + + cmp N, xzr + ble .Lzsum_kernel_L999 + cmp INC_X, xzr + ble .Lzsum_kernel_L999 + + cmp INC_X, #1 + bne .Lzsum_kernel_S_BEGIN + +.Lzsum_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq .Lzsum_kernel_F1 + +.Lzsum_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne .Lzsum_kernel_F4 + + KERNEL_F4_FINALIZE + +.Lzsum_kernel_F1: + + ands I, N, #3 + ble .Lzsum_kernel_L999 + +.Lzsum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne .Lzsum_kernel_F10 + +.Lzsum_kernel_L999: + ret + +.Lzsum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble .Lzsum_kernel_S1 + +.Lzsum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne .Lzsum_kernel_S4 + +.Lzsum_kernel_S1: + + ands I, N, #3 + ble .Lzsum_kernel_L999 + +.Lzsum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne .Lzsum_kernel_S10 + + ret + + EPILOGUE From f8b82bc6dc0c7650fa757c6eadf1906e0bc50950 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:18:03 +0100 Subject: [PATCH 166/189] Add ia64 implementation of ?sum as trivial copy of asum with the fabs calls removed --- kernel/ia64/KERNEL | 4 + kernel/ia64/sum.S | 358 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 362 insertions(+) create mode 100644 kernel/ia64/sum.S diff --git a/kernel/ia64/KERNEL b/kernel/ia64/KERNEL index 10a7e61e25..870aac473e 100644 --- a/kernel/ia64/KERNEL +++ b/kernel/ia64/KERNEL @@ -60,6 +60,10 @@ CASUMKERNEL = asum.S ZASUMKERNEL = asum.S XASUMKERNEL = asum.S +CSUMKERNEL = sum.S +ZSUMKERNEL = sum.S +XSUMKERNEL = sum.S + CNRM2KERNEL = nrm2.S ZNRM2KERNEL = nrm2.S XNRM2KERNEL = nrm2.S diff --git a/kernel/ia64/sum.S b/kernel/ia64/sum.S new file mode 100644 index 0000000000..561d5d7715 --- /dev/null +++ b/kernel/ia64/sum.S @@ -0,0 +1,358 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2019, The OpenBLAS project */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#ifndef COMPLEX +#define COMPADD 0 +#define STRIDE INCX +#else +#define COMPADD 1 +#define STRIDE SIZE +#endif + +#define PRE1 r2 + +#define I r17 +#define J r18 +#define INCX16 r21 + +#define PR r30 +#define ARLC r31 + +#define N r32 +#define X r33 +#define INCX r34 + + + PROLOGUE + .prologue + PROFCODE + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, X + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body +#ifdef F_INTERFACE + { .mmi + LDINT N = [N] + LDINT INCX = [INCX] + nop.i 0 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#endif + { .mmi + cmp.lt p0, p6 = r0, INCX + cmp.lt p0, p7 = r0, N + shr I = N, (4 - COMPADD) + } + { .mbb + and J = ((1 << (4 - COMPADD)) - 1), N + (p6) br.ret.sptk.many b0 + (p7) br.ret.sptk.many b0 + } + ;; + { .mfi + adds I = -1, I + mov f10 = f0 + mov PR = pr + } + { .mfi + cmp.eq p9, p0 = r0, J + mov f9 = f0 + tbit.z p0, p12 = N, 3 - COMPADD + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + cmp.ne p17, p0 = r0, r0 + mov ar.ec= 3 + } + { .mfi + cmp.ne p18, p0 = r0, r0 + mov f11 = f0 + shl INCX = INCX, BASE_SHIFT + COMPADD + } + ;; + { .mmi +#ifdef XDOUBLE + shladd INCX16 = INCX, (3 - COMPADD), r0 +#else + shladd INCX16 = INCX, (4 - COMPADD), r0 +#endif + cmp.ne p19, p0 = r0, r0 + mov ar.lc = I + } + { .mmb + cmp.gt p8 ,p0 = r0, I +#ifdef COMPLEX + adds INCX = - SIZE, INCX +#else + nop.m 0 +#endif + (p8) br.cond.dpnt .L55 + } + ;; + .align 32 + +.L52: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X], STRIDE + } + { .mfb + (p19) FADD f8 = f8, f71 + } + ;; + { .mmf + (p16) LDFD f35 = [X], INCX + } + { .mfb + (p19) FADD f9 = f9, f74 + } + ;; + { .mmf + (p16) LDFD f38 = [X], STRIDE + } + { .mfb + (p19) FADD f10 = f10, f77 
+ } + ;; + { .mmf + (p16) LDFD f41 = [X], INCX + } + { .mfb + (p19) FADD f11 = f11, f80 + } + ;; + { .mmf + (p16) LDFD f44 = [X], STRIDE + } + { .mfb + (p18) FADD f8 = f8, f34 + } + ;; + { .mmf + (p16) LDFD f47 = [X], INCX + } + { .mfb + (p18) FADD f9 = f9, f37 + } + ;; + { .mmf + (p16) LDFD f50 = [X], STRIDE + } + { .mfb + (p18) FADD f10 = f10, f40 + } + ;; + { .mmf + (p16) LDFD f53 = [X], INCX + } + { .mfb + (p18) FADD f11 = f11, f43 + } + ;; + { .mmf +#ifdef XDOUBLE + (p16) lfetch.nt1 [PRE1], INCX16 +#endif + (p16) LDFD f56 = [X], STRIDE + } + { .mfb + (p18) FADD f8 = f8, f46 + } + ;; + { .mmf + (p16) LDFD f59 = [X], INCX + } + { .mfb + (p18) FADD f9 = f9, f49 + } + ;; + { .mmf + (p16) LDFD f62 = [X], STRIDE + } + { .mfb + (p18) FADD f10 = f10, f52 + } + ;; + { .mmf + (p16) LDFD f65 = [X], INCX + } + { .mfb + (p18) FADD f11 = f11, f55 + } + ;; + { .mmf + (p16) LDFD f68 = [X], STRIDE + } + { .mfb + (p18) FADD f8 = f8, f58 + } + ;; + { .mmf + (p16) LDFD f71 = [X], INCX + } + { .mfb + (p18) FADD f9 = f9, f61 + } + ;; + { .mmf + (p16) LDFD f74 = [X], STRIDE + } + { .mfb + (p18) FADD f10 = f10, f64 + } + ;; + { .mmf + (p16) LDFD f77 = [X], INCX + } + { .mfb + (p18) FADD f11 = f11, f67 + br.ctop.sptk.few .L52 + } + ;; + FADD f8 = f8, f71 + FADD f9 = f9, f74 + FADD f10 = f10, f77 + FADD f11 = f11, f80 + .align 32 + ;; +.L55: + (p12) LDFD f32 = [X], STRIDE + (p9) br.cond.dptk .L998 + ;; + (p12) LDFD f33 = [X], INCX + ;; + (p12) LDFD f34 = [X], STRIDE + ;; + (p12) LDFD f35 = [X], INCX + tbit.z p0, p13 = N, (2 - COMPADD) + ;; + (p12) LDFD f36 = [X], STRIDE + tbit.z p0, p14 = N, (1 - COMPADD) + ;; + (p12) LDFD f37 = [X], INCX +#ifndef COMPLEX + tbit.z p0, p15 = N, 0 +#endif + ;; + (p12) LDFD f38 = [X], STRIDE + ;; + (p12) LDFD f39 = [X], INCX + ;; + (p13) LDFD f40 = [X], STRIDE + ;; + (p13) LDFD f41 = [X], INCX + ;; + (p13) LDFD f42 = [X], STRIDE + (p12) FADD f8 = f8, f32 + ;; + (p13) LDFD f43 = [X], INCX + (p12) FADD f9 = f9, f33 + ;; + (p14) LDFD f44 = [X], STRIDE + (p12) FADD f10 = f10, f34 + ;; + (p14) LDFD f45 = [X], INCX + (p12) FADD f11 = f11, f35 + ;; +#ifndef COMPLEX + (p15) LDFD f46 = [X] +#endif + (p12) FADD f8 = f8, f36 + ;; + (p12) FADD f9 = f9, f37 + (p12) FADD f10 = f10, f38 + (p12) FADD f11 = f11, f39 + ;; + (p13) FADD f8 = f8, f40 + (p13) FADD f9 = f9, f41 +#ifndef COMPLEX +#endif + (p13) FADD f10 = f10, f42 + ;; + (p13) FADD f11 = f11, f43 + (p14) FADD f8 = f8, f44 + (p14) FADD f9 = f9, f45 +#ifndef COMPLEX + (p15) FADD f10 = f10, f46 +#endif + ;; + .align 32 + +.L998: + { .mfi + FADD f8 = f8, f9 + mov ar.lc = ARLC + } + { .mmf + FADD f10 = f10, f11 + } + ;; + { .mii + mov pr = PR, -65474 + } + ;; + { .mfb + FADD f8 = f8, f10 + br.ret.sptk.many b0 + } + EPILOGUE From cdbe0f0235b0d23b19daeb40fab98ec83260197c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:20:14 +0100 Subject: [PATCH 167/189] Add MIPS implementation of ?sum as trivial copy of ?asum with the fabs calls removed --- kernel/mips/KERNEL.P5600 | 5 ++++ kernel/mips/sum.c | 47 ++++++++++++++++++++++++++++++++++++ kernel/mips/zsum.c | 52 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 kernel/mips/sum.c create mode 100644 kernel/mips/zsum.c diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 1ab1930698..9a6e06d673 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c ISMINKERNEL = ../mips/imin.c IDMINKERNEL = ../mips/imin.c +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = 
../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + ifdef HAVE_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c diff --git a/kernel/mips/sum.c b/kernel/mips/sum.c new file mode 100644 index 0000000000..8ce3812a19 --- /dev/null +++ b/kernel/mips/sum.c @@ -0,0 +1,47 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += x[i]; + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/mips/zsum.c b/kernel/mips/zsum.c new file mode 100644 index 0000000000..01f8ced7c7 --- /dev/null +++ b/kernel/mips/zsum.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
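These generic C kernels, like the assembly ones, back OpenBLAS's non-standard ?sum extension: the BLAS-1 asum reduction without the absolute values. A hedged usage sketch at the library interface; the cblas_dsum prototype is assumed to follow the asum-style signature and should be checked against the cblas.h of the build in use:

#include <stdio.h>
#include <cblas.h>

int main(void)
{
    double x[4] = { 1.0, -2.0, 3.0, -4.0 };
    // cblas_dasum(4, x, 1) would return 10.0; the sum extension keeps the signs
    printf("dsum = %f\n", cblas_dsum(4, x, 1));   // expected: -2.000000
    return 0;
}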
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#define CSUM1(x,i) x[i]+x[i+1] + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CSUM1(x,i); + i += inc_x2; + } + return(sumf); +} + + From 688fa9201c74a8cc1eafd85ebf36cd74f4bf89f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:22:15 +0100 Subject: [PATCH 168/189] Add MIPS64 implementation of ?sum as trivial copy of ?asum with the fabs replaced by mov to preserve code structure --- kernel/mips64/sum.S | 332 +++++++++++++++++++++++++++++++++++++++ kernel/mips64/zsum.S | 204 ++++++++++++++++++++++ 2 files changed, 536 insertions(+) create mode 100644 kernel/mips64/sum.S create mode 100644 kernel/mips64/zsum.S diff --git a/kernel/mips64/sum.S b/kernel/mips64/sum.S new file mode 100644 index 0000000000..261630d49d --- /dev/null +++ b/kernel/mips64/sum.S @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, BASE_SHIFT + + blez N, .L999 + li TEMP, SIZE + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + + LD a5, 4 * SIZE(X) + MOV t1, a1 + LD a6, 5 * SIZE(X) + MOV t2, a2 + LD a7, 6 * SIZE(X) + MOV t3, a3 + + MOV t4, a4 + daddiu I, I, -1 + + blez I, .L13 + LD a8, 7 * SIZE(X) + .align 3 + +.L12: + ADD s1, s1, t1 + LD a1, 8 * SIZE(X) + + MOV t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 9 * SIZE(X) + + MOV t2, a6 + NOP + + ADD s1, s1, t3 + LD a3, 10 * SIZE(X) + + MOV t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 11 * SIZE(X) + + MOV t4, a8 + daddiu X, X, 8 * SIZE + + ADD s1, s1, t1 + LD a5, 4 * SIZE(X) + + MOV t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 5 * SIZE(X) + + MOV t2, a2 + NOP + + ADD s1, s1, t3 + LD a7, 6 * SIZE(X) + + MOV t3, a3 + NOP + + ADD s2, s2, t4 + LD a8, 7 * SIZE(X) + + bgtz I, .L12 + MOV t4, a4 + .align 3 + +.L13: + ADD s1, s1, t1 + daddiu X, X, 8 * SIZE + + MOV t1, a5 + NOP + + ADD s2, s2, t2 + MOV t2, a6 + + ADD s1, s1, t3 + MOV t3, a7 + + ADD s2, s2, t4 + MOV t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + MOV t1, a1 + + ADD s1, s1, t1 + + bgtz I, .L16 + daddiu X, X, SIZE + + j .L999 + NOP + .align 3 + +.L20: + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + MOV t1, a1 + LD a7, 0 * SIZE(X) + + MOV t2, a2 + daddu X, X, INCX + + MOV t3, a3 + LD a8, 0 * SIZE(X) + + MOV t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + MOV t1, a5 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a2, 0 * SIZE(X) + + MOV t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + MOV t3, a7 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a4, 0 * SIZE(X) + + MOV t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + MOV t1, a1 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a6, 0 * SIZE(X) + + MOV t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + MOV t3, a3 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a8, 0 * SIZE(X) + + MOV t4, a4 + daddiu I, I, -1 + + bgtz I, .L23 + daddu X, X, INCX + .align 3 + +.L24: + ADD s1, s1, t1 + MOV t1, a5 + + ADD s2, s2, t2 + MOV t2, a6 + + ADD s1, s1, t3 + MOV t3, a7 + + ADD s2, s2, t4 + MOV t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + MOV t1, a1 + daddu X, X, INCX + + bgtz I, .L26 + ADD s1, s1, t1 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git a/kernel/mips64/zsum.S 
b/kernel/mips64/zsum.S new file mode 100644 index 0000000000..129b97900e --- /dev/null +++ b/kernel/mips64/zsum.S @@ -0,0 +1,204 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, ZBASE_SHIFT + + blez N, .L999 + dsra I, N, 2 + + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + MOV t1, a1 + MOV t2, a2 + + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + + MOV t3, a3 + MOV t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + MOV t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 1 * SIZE(X) + + MOV t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + MOV t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 1 * SIZE(X) + + MOV t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + MOV t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 1 * SIZE(X) + + MOV t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + MOV t3, a3 + LD a8, 1 * SIZE(X) + + ADD s2, s2, t4 + daddu X, X, INCX + + bgtz I, .L23 + MOV t4, a4 + .align 3 + +.L24: + ADD s1, s1, t1 + MOV t1, a5 + + ADD s2, s2, t2 + MOV t2, a6 + + ADD s1, s1, t3 + MOV t3, a7 + + ADD s2, s2, t4 + MOV t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + MOV t1, a1 + daddiu I, I, -1 + MOV t2, a2 + daddu X, X, INCX + + ADD s1, s1, t1 + bgtz I, .L26 + ADD s2, s2, t2 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE From 706dfe263b7e3fb20dca7c7e9fdab79c9e86cb13 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:23:42 +0100 Subject: [PATCH 169/189] Add POWER implementation of ?sum as trivial copy of ?asum with the fabs replaced by fmr to preserve code structure --- kernel/power/sum.S | 446 +++++++++++++++++++++++++++++++++++++++++++ kernel/power/zsum.S | 452 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 898 insertions(+) create mode 100644 kernel/power/sum.S create mode 100644 kernel/power/zsum.S diff --git a/kernel/power/sum.S b/kernel/power/sum.S new file mode 100644 index 0000000000..eda2c5f2c0 --- /dev/null +++ b/kernel/power/sum.S @@ -0,0 +1,446 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + + FADD f0, f0, f8 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + FADD f0, f0, f8 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zsum.S b/kernel/power/zsum.S new file mode 100644 index 0000000000..8396012e8a --- /dev/null +++ b/kernel/power/zsum.S @@ -0,0 +1,452 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCXM1 r9 +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + FADD f0, f0, f8 + FADD f1, f1, f9 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + FADD f0, f0, f8 + FADD f1, f1, f9 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE From 70f2a4e0d70609f13c9f35112b90516830c30689 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:25:06 +0100 Subject: [PATCH 170/189] Add SPARC implementation of ?sum as trivial copy of ?asum with the fabs replaced by fmov to preserve code structure --- kernel/sparc/sum.S | 325 +++++++++++++++++++++++++++++++++++++++++++ kernel/sparc/zsum.S | 327 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 652 insertions(+) create mode 100644 kernel/sparc/sum.S create mode 100644 kernel/sparc/zsum.S diff --git a/kernel/sparc/sum.S b/kernel/sparc/sum.S new file mode 100644 index 0000000000..f26abb85f9 --- /dev/null +++ b/kernel/sparc/sum.S @@ -0,0 +1,325 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, BASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + cmp INCX, SIZE + bne .LL50 + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 128 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FMOV a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FMOV a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FMOV a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FMOV a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FMOV a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FMOV a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD 
c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FMOV a1, t1 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FADD c2, t2, c2 + cmp I, 0 + FMOV a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + FADD c2, t4, c2 + FMOV a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FMOV a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + + FADD c2, t2, c2 + FMOV a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + + FADD c2, t4, c2 + FMOV a8, t4 + LDF [X + 0 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FADD c1, t1, c1 + add I, -1, I + FMOV a1, t1 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zsum.S b/kernel/sparc/zsum.S new file mode 100644 index 0000000000..bc167dc72a --- /dev/null +++ b/kernel/sparc/zsum.S @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, ZBASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + nop + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 32 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FMOV a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FMOV a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FMOV a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FMOV a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FMOV a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FMOV a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FADD c2, t2, c2 + FMOV a1, t1 + FMOV a2, t2 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, 
INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + cmp I, 0 + FMOV a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a3, t3 + LDF [X + 0 * SIZE], a3 + + FADD c2, t4, c2 + FMOV a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FMOV a5, t1 + LDF [X + 0 * SIZE], a5 + + FADD c2, t2, c2 + FMOV a6, t2 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 0 * SIZE], a7 + + FADD c2, t4, c2 + FMOV a8, t4 + LDF [X + 1 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FADD c1, t1, c1 + FADD c2, t2, c2 + add I, -1, I + FMOV a1, t1 + FMOV a2, t2 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE From e3bc83f2a8b3304fd1d8107a2f73a672a4ec5ffe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:26:10 +0100 Subject: [PATCH 171/189] Add x86 implementation of ?sum as trivial copy of ?asum with the fabs calls removed --- kernel/x86/sum.S | 207 +++++++++++++++++++++++++++++++++++++++++++++ kernel/x86/zsum.S | 208 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 415 insertions(+) create mode 100644 kernel/x86/sum.S create mode 100644 kernel/x86/zsum.S diff --git a/kernel/x86/sum.S b/kernel/x86/sum.S new file mode 100644 index 0000000000..b24f34c8be --- /dev/null +++ b/kernel/x86/sum.S @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $BASE_SHIFT, INCX + fldz + fldz + fldz + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L21: + FLD (X) + faddp %st,%st(1) + addl $1 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addl INCX, X + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zsum.S b/kernel/x86/zsum.S new file mode 100644 index 0000000000..cd2ce61db6 --- /dev/null +++ b/kernel/x86/zsum.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpl $SIZE * 2, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + faddp %st,%st(3) + faddp %st,%st(1) + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + FLD 1 * 
SIZE(X) + addl INCX, X + faddp %st,%st(3) + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE From 9d717cb5ee817f87a1306d64da75a09375abd407 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:27:04 +0100 Subject: [PATCH 172/189] Add x86_64 implementation of ?sum as trivial copy of ?asum with the fabs calls removed --- kernel/x86_64/sum.S | 179 ++++++++++++++++++++++++++++++++++++++++++ kernel/x86_64/zsum.S | 180 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 359 insertions(+) create mode 100644 kernel/x86_64/sum.S create mode 100644 kernel/x86_64/zsum.S diff --git a/kernel/x86_64/sum.S b/kernel/x86_64/sum.S new file mode 100644 index 0000000000..d075eaa042 --- /dev/null +++ b/kernel/x86_64/sum.S @@ -0,0 +1,179 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $BASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $7, M + jle .L998 + ALIGN_4 + +.L21: + FLD (X) + faddp %st,%st(1) + addq $1 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $7, M + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addq INCX, X + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/zsum.S b/kernel/x86_64/zsum.S new file mode 100644 index 0000000000..45e0ddff55 --- /dev/null +++ b/kernel/x86_64/zsum.S @@ -0,0 +1,180 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE * 2, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $3, M + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + faddp %st,%st(3) + faddp %st,%st(1) + addq $2 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + faddp %st,%st(3) + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret + + EPILOGUE From 246ca29679c5e74d2f306e39eefd1939aa6c37bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Mar 2019 22:49:05 +0100 Subject: [PATCH 173/189] Add ZARCH implementation of ?sum as trivial copies of the respective ?asum kernels with the ABS and vflpsb calls removed --- kernel/zarch/KERNEL.Z13 | 5 + kernel/zarch/KERNEL.Z14 | 5 + kernel/zarch/KERNEL.ZARCH_GENERIC | 5 + kernel/zarch/csum.c | 137 +++++++++++++++++++++++++++ kernel/zarch/dsum.c | 148 +++++++++++++++++++++++++++++ kernel/zarch/ssum.c | 151 ++++++++++++++++++++++++++++++ kernel/zarch/zsum.c | 136 +++++++++++++++++++++++++++ 7 files changed, 587 insertions(+) create mode 100644 kernel/zarch/csum.c create mode 100644 kernel/zarch/dsum.c create mode 100644 kernel/zarch/ssum.c create mode 100644 kernel/zarch/zsum.c diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index 22c7e97032..b1ffd3c54d 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = zasum.c +SSUMKERNEL = ../arm/asum.c +DSUMKERNEL = dasum.c +CSUMKERNEL = ../arm/zasum.c +ZSUMKERNEL = zasum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index 80f78f48fa..971896c2d4 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c 
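+# ?sum kernels are plain sums: the corresponding ?asum kernels with the absolute-value operations removed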
+SSUMKERNEL = ssum.c +DSUMKERNEL = dsum.c +CSUMKERNEL = csum.c +ZSUMKERNEL = zsum.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c diff --git a/kernel/zarch/KERNEL.ZARCH_GENERIC b/kernel/zarch/KERNEL.ZARCH_GENERIC index 848ee9b548..3bbeb9155d 100644 --- a/kernel/zarch/KERNEL.ZARCH_GENERIC +++ b/kernel/zarch/KERNEL.ZARCH_GENERIC @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/zarch/csum.c b/kernel/zarch/csum.c new file mode 100644 index 0000000000..c0b8c6371d --- /dev/null +++ b/kernel/zarch/csum.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT sum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[sum],0" + : [sum] "=Q"(sum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return sum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (sumf); + + if (inc_x == 1) { + + n1 = n & -32; + if (n1 > 0) { + + sumf = csum_kernel_32(n1, x); + i = n1; + ip = 2 * n1; + } + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + i++; + ip += 2; + } + + } else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + ip += inc_x2; + i++; + } + + } + return (sumf); +} diff --git a/kernel/zarch/dsum.c b/kernel/zarch/dsum.c new file mode 100644 index 0000000000..178bc3462d --- /dev/null +++ b/kernel/zarch/dsum.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT sum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[sum],0" + : [sum] "=Q"(sum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return sum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) + return sumf; + + if (inc_x == 1) { + + n1 = n & -32; + + if (n1 > 0) { + + sumf = dsum_kernel_32(n1, x); + i = n1; + } + + while (i < n) { + sumf += x[i]; + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += x[i]; + sum2 += x[i + inc_x]; + sum1 += x[i + 2 * inc_x]; + sum2 += x[i + 3 * 
inc_x]; + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += x[i]; + i += inc_x; + j++; + } + + } + return sumf; +} diff --git a/kernel/zarch/ssum.c b/kernel/zarch/ssum.c new file mode 100644 index 0000000000..a433ab5923 --- /dev/null +++ b/kernel/zarch/ssum.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + + +static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT sum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[sum],0" + : [sum] "=Q"(sum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return sum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) + return sumf; + + if (inc_x == 1) { + + n1 = n & -64; + + if (n1 > 0) { + + sumf = ssum_kernel_64(n1, x); + i = n1; + } + + while (i < n) { + sumf += x[i]; + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += x[i]; + sum2 += x[i + inc_x]; + sum1 += x[i + 2 * inc_x]; + sum2 += x[i + 3 * inc_x]; + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += x[i]; + i += inc_x; + j++; + } + + } + return sumf; +} diff --git a/kernel/zarch/zsum.c b/kernel/zarch/zsum.c new file mode 100644 index 0000000000..7cfc1f17f9 --- /dev/null +++ b/kernel/zarch/zsum.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + + +static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT sum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[sum],0" + : [sum] "=Q"(sum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return sum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (sumf); + + if (inc_x == 1) { + + n1 = n & -16; + if (n1 > 0) { + + sumf = zsum_kernel_16(n1, x); + i = n1; + ip = 2 * n1; + } + + 
while (i < n) { + sumf += x[ip] + x[ip + 1]; + i++; + ip += 2; + } + + } else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + ip += inc_x2; + i++; + } + + } + return (sumf); +} From 1679de5e5968bdeffd63793bed55048088216c18 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Mar 2019 10:50:43 +0200 Subject: [PATCH 174/189] Detect 32bit environment on 64bit ARM hardware for #2056, using same approach as #2058 --- cmake/system_check.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index f30a946b49..94d3ba6437 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -49,7 +49,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") set(ARM 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") - set(ARM64 1) + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + set(ARM64 1) + else() + set(ARM 1) + endif() endif() if (X86_64) From d17da6c6a44bcf94a1e677642288261f7a1848d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Mar 2019 11:57:01 +0200 Subject: [PATCH 175/189] Add cmake defaults for ?sum kernels --- cmake/kernel.cmake | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index fad84de519..0ed09e7763 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -107,6 +107,12 @@ macro(SetDefaultL1) set(DAXPBYKERNEL ../arm/axpby.c) set(CAXPBYKERNEL ../arm/zaxpby.c) set(ZAXPBYKERNEL ../arm/zaxpby.c) + set(SSUMKERNEL sum.S) + set(DSUMKERNEL sum.S) + set(CSUMKERNEL zsum.S) + set(ZSUMKERNEL zsum.S) + set(QSUMKERNEL sum.S) + set(XSUMKERNEL zsum.S) endmacro () macro(SetDefaultL2) @@ -162,4 +168,4 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) -endmacro () \ No newline at end of file +endmacro () From 100d94f94edca3274f75658198dada784dd18daa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Mar 2019 13:55:05 +0200 Subject: [PATCH 176/189] Add ?sum --- kernel/x86/KERNEL.generic | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/x86/KERNEL.generic b/kernel/x86/KERNEL.generic index 672edb0696..0aac0ce996 100644 --- a/kernel/x86/KERNEL.generic +++ b/kernel/x86/KERNEL.generic @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c From c04a7290812f79972b7dbe92be4ccd6e879e88d8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Mar 2019 13:55:49 +0200 Subject: [PATCH 177/189] Add ?sum definitions for generic kernel --- kernel/x86_64/KERNEL.generic | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic index a23e59f3f8..7cb0cb836c 100644 --- a/kernel/x86_64/KERNEL.generic +++ b/kernel/x86_64/KERNEL.generic @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c From 21d146a8de232a2774d706c5725586dca3d39c02 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 
Mar 2019 22:12:23 +0200 Subject: [PATCH 178/189] Add declarations for ?sum --- common_q.h | 2 ++ common_x.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/common_q.h b/common_q.h index 30ad3727ad..b4ace3a628 100644 --- a/common_q.h +++ b/common_q.h @@ -19,6 +19,7 @@ #define QDOTC_K qdot_k #define QNRM2_K qnrm2_k #define QSCAL_K qscal_k +#define QSUM_K qsum_k #define QSWAP_K qswap_k #define QROT_K qrot_k @@ -161,6 +162,7 @@ #define QDOTC_K gotoblas -> qdot_k #define QNRM2_K gotoblas -> qnrm2_k #define QSCAL_K gotoblas -> qscal_k +#define QSUM_K gotoblas -> qsum_k #define QSWAP_K gotoblas -> qswap_k #define QROT_K gotoblas -> qrot_k diff --git a/common_x.h b/common_x.h index 03b98db4f4..2ed525faa1 100644 --- a/common_x.h +++ b/common_x.h @@ -19,6 +19,7 @@ #define XDOTC_K xdotc_k #define XNRM2_K xnrm2_k #define XSCAL_K xscal_k +#define XSUM_K xsum_k #define XSWAP_K xswap_k #define XROT_K xqrot_k @@ -227,6 +228,7 @@ #define XDOTC_K gotoblas -> xdotc_k #define XNRM2_K gotoblas -> xnrm2_k #define XSCAL_K gotoblas -> xscal_k +#define XSUM_K gotoblas -> xsum_k #define XSWAP_K gotoblas -> xswap_k #define XROT_K gotoblas -> xqrot_k From 9229d6859b5f4b185315048ccc58644c9112bdd5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Apr 2019 09:38:18 +0200 Subject: [PATCH 179/189] Add -lm and disable EXPRECISION support on *BSD fixes #2075 --- cmake/os.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/os.cmake b/cmake/os.cmake index 1321ef6194..2d25e7aaae 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(NO_EXPRECISION 1) endif () +if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") + set(EXTRALIB "${EXTRALIB} -lm") + set(NO_EXPRECISION 1) +endif () + if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") set(EXTRALIB "${EXTRALIB} -lm") endif () From bcdf1d49170508fd5c8250f802dd9018b7771534 Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Tue, 9 Apr 2019 14:13:24 +1000 Subject: [PATCH 180/189] Add in runtime CPU detection for POWER. 
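
The core is selected at library load time with GCC's __builtin_cpu_is(), choosing between the POWER6, POWER8 and POWER9 kernel sets and falling back to POWER8 when the processor is not recognized. As with the other DYNAMIC_ARCH targets, the detected core can be overridden through the OPENBLAS_CORETYPE environment variable; intended use is along these lines (program name hypothetical):

    OPENBLAS_CORETYPE=POWER8 ./my_blas_app
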
--- Makefile.system | 6 ++ driver/others/Makefile | 8 +++ driver/others/dynamic_power.c | 102 ++++++++++++++++++++++++++++++++++ kernel/power/KERNEL.POWER8 | 32 +++++------ kernel/power/KERNEL.POWER9 | 32 +++++------ kernel/setparam-ref.c | 22 ++++++++ 6 files changed, 170 insertions(+), 32 deletions(-) create mode 100644 driver/others/dynamic_power.c diff --git a/Makefile.system b/Makefile.system index 53f89b2fa6..a95d6190f3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -528,6 +528,12 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 endif +ifeq ($(ARCH), power) +DYNAMIC_CORE = POWER6 +DYNAMIC_CORE += POWER8 +DYNAMIC_CORE += POWER9 +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= diff --git a/driver/others/Makefile b/driver/others/Makefile index 3dc2e7c1ba..d4b5c26d53 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -18,8 +18,12 @@ ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH),arm64) COMMONOBJS += dynamic_arm64.$(SUFFIX) else +ifeq ($(ARCH),power) +COMMONOBJS += dynamic_power.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -78,8 +82,12 @@ ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH),arm64) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) else +ifeq ($(ARCH),power) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c new file mode 100644 index 0000000000..0c4a87a5e3 --- /dev/null +++ b/driver/others/dynamic_power.c @@ -0,0 +1,102 @@ + +#include "common.h" + +extern gotoblas_t gotoblas_POWER6; +extern gotoblas_t gotoblas_POWER8; +extern gotoblas_t gotoblas_POWER9; + +extern void openblas_warning(int verbose, const char *msg); + +static char *corename[] = { + "unknown", + "POWER6", + "POWER8", + "POWER9" +}; + +#define NUM_CORETYPES 4 + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_POWER6) return corename[1]; + if (gotoblas == &gotoblas_POWER8) return corename[2]; + if (gotoblas == &gotoblas_POWER9) return corename[3]; + return corename[0]; +} + +static gotoblas_t *get_coretype(void) { + + if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) + return &gotoblas_POWER6; + if (__builtin_cpu_is("power8")) + return &gotoblas_POWER8; + if (__builtin_cpu_is("power9")) + return &gotoblas_POWER9; + return NULL; +} + +static gotoblas_t *force_coretype(char * coretype) { + + int i ; + int found = -1; + char message[128]; + + for ( i = 0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 1: return (&gotoblas_POWER6); + case 2: return (&gotoblas_POWER8); + case 3: return (&gotoblas_POWER9); + default: return NULL; + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); +} + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to POWER8 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_POWER8; + } + + if (gotoblas && gotoblas -> init) { + 
strncpy(coren,gotoblas_corename(),20); + sprintf(coremsg, "Core: %s\n",coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index e6f69c7c47..43f004fbbb 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c ZGEMMITCOPY = zgemm_tcopy_8_power8.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 86a9319714..e166f252fc 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power9.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S 
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c ZGEMMITCOPY = zgemm_tcopy_8_power8.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 6d4028b0b2..b964a8bad2 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -718,6 +718,27 @@ static void init_parameter(void) { } #else // defined(ARCH_ARM64) +#if defined(ARCH_POWER) +static void init_parameter(void) { + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +} +#else //POWER + #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1303,4 +1324,5 @@ static void init_parameter(void) { } +#endif //POWER #endif //defined(ARCH_ARM64) From 40e53e52d645d1cbef76c8432847fa3c219b9dd2 Mon Sep 17 00:00:00 2001 From: Jeff Baylor Date: Mon, 22 Apr 2019 17:01:34 -0700 Subject: [PATCH 181/189] snprintf define consolidated to common.h --- common.h | 2 ++ driver/others/openblas_get_config.c | 6 ------ utest/ctest.h | 4 ---- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/common.h b/common.h index 239b2a850c..0ac74bb20a 100644 --- a/common.h +++ b/common.h @@ -85,6 +85,8 @@ extern "C" { #if !defined(_MSC_VER) #include +#elif _MSC_VER < 1900 +#define snprintf _snprintf #endif #include diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index eca494dca3..81648fb7c3 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -35,12 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#if defined(_WIN32) && defined(_MSC_VER) -#if _MSC_VER < 1900 -#define snprintf _snprintf -#endif -#endif - static char* openblas_config_str="" "OpenBLAS " VERSION diff --git a/utest/ctest.h b/utest/ctest.h index f297dafbae..d316b14943 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -83,10 +83,6 @@ struct ctest { #undef CTEST_SEGFAULT #endif -#if _MSC_VER < 1900 -#define snprintf _snprintf -#endif - #ifndef __cplusplus #define inline __inline #endif From 9a19616a282d0c01d6695c7419dff01895d25d73 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Apr 2019 18:55:47 +0200 Subject: [PATCH 182/189] Support INTERFACE64=1 --- relapack/inc/relapack.h | 116 ++++++++++++++++++++++------------------ 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/relapack/inc/relapack.h b/relapack/inc/relapack.h index e421f352b1..7f283e04d5 100644 --- a/relapack/inc/relapack.h +++ b/relapack/inc/relapack.h @@ -1,67 +1,79 @@ #ifndef RELAPACK_H #define RELAPACK_H -void RELAPACK_slauum(const char *, const int *, float *, const int *, int *); -void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *); -void RELAPACK_clauum(const char *, const int *, float *, const int *, int *); -void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *); +#ifdef USE64BITINT + typedef BLASLONG blasint; + #if defined(OS_WINDOWS) && defined(__64BIT__) + #define blasabs(x) llabs(x) + #else + #define blasabs(x) labs(x) + #endif +#else + typedef int blasint; + #define blasabs(x) abs(x) +#endif -void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *); -void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *); -void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *); -void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *); +void RELAPACK_slauum(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dlauum(const char *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_clauum(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_zlauum(const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *); -void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *); -void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *); -void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *); +void RELAPACK_strtri(const char *, const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dtrtri(const char *, const char *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_ctrtri(const char *, const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_ztrtri(const char *, const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *); -void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *); -void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *); -void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *); +void RELAPACK_spotrf(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dpotrf(const char *, const blasint *, double *, const 
blasint *, blasint *); +void RELAPACK_cpotrf(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_zpotrf(const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_spbtrf(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dpbtrf(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_cpbtrf(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_zpbtrf(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *); -void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_ssytrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_ssytrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf_rook(const char *, const blasint *, 
float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); -void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_sgetrf(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_dgetrf(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_cgetrf(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_zgetrf(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); -void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +void RELAPACK_sgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_dgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_cgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_zgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); -void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_ssygst(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +void RELAPACK_dsygst(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); +void RELAPACK_chegst(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +void RELAPACK_zhegst(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, 
blasint *); -void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); -void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); -void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); -void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); +void RELAPACK_strsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_dtrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); +void RELAPACK_ctrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_ztrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); -void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +void RELAPACK_stgsyl(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_dtgsyl(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_ctgsyl(const char *, const 
blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_ztgsyl(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, const blasint *, blasint *, blasint *); + +void RELAPACK_sgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +void RELAPACK_dgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +void RELAPACK_cgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +void RELAPACK_zgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); #endif /* RELAPACK_H */ From 798c448b0c9ed1d0546f3d660a26f66d6a852283 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Apr 2019 19:06:00 +0200 Subject: [PATCH 183/189] Add support for INTERFACE64 and fix XERBLA calls 1. Replaced all instances of "int" with "blasint" 2. 
Added string length as "hidden" third parameter in calls to fortran XERBLA --- relapack/src/blas.h | 106 +++++----- relapack/src/cgbtrf.c | 78 +++---- relapack/src/cgemmt.c | 66 +++--- relapack/src/cgetrf.c | 40 ++-- relapack/src/chegst.c | 38 ++-- relapack/src/chetrf.c | 72 +++---- relapack/src/chetrf_rec2.c | 32 +-- relapack/src/chetrf_rook.c | 72 +++---- relapack/src/chetrf_rook_rec2.c | 40 ++-- relapack/src/clauum.c | 28 +-- relapack/src/cpbtrf.c | 42 ++-- relapack/src/cpotrf.c | 28 +-- relapack/src/csytrf.c | 72 +++---- relapack/src/csytrf_rec2.c | 32 +-- relapack/src/csytrf_rook.c | 72 +++---- relapack/src/csytrf_rook_rec2.c | 40 ++-- relapack/src/ctgsyl.c | 62 +++--- relapack/src/ctrsyl.c | 52 ++--- relapack/src/ctrsyl_rec2.c | 36 ++-- relapack/src/ctrtri.c | 34 +-- relapack/src/dgbtrf.c | 80 +++---- relapack/src/dgemmt.c | 62 +++--- relapack/src/dgetrf.c | 40 ++-- relapack/src/dlauum.c | 28 +-- relapack/src/dpbtrf.c | 42 ++-- relapack/src/dpotrf.c | 28 +-- relapack/src/dsygst.c | 40 ++-- relapack/src/dsytrf.c | 72 +++---- relapack/src/dsytrf_rec2.c | 32 +-- relapack/src/dsytrf_rook.c | 72 +++---- relapack/src/dsytrf_rook_rec2.c | 38 ++-- relapack/src/dtgsyl.c | 66 +++--- relapack/src/dtrsyl.c | 56 ++--- relapack/src/dtrsyl_rec2.c | 58 ++--- relapack/src/dtrtri.c | 34 +-- relapack/src/f2c.c | 2 +- relapack/src/f2c.h | 13 ++ relapack/src/lapack.h | 124 +++++------ relapack/src/lapack_wrappers.c | 360 ++++++++++++++++---------------- relapack/src/relapack.h | 42 ++-- relapack/src/sgbtrf.c | 79 ++++--- relapack/src/sgemmt.c | 62 +++--- relapack/src/sgetrf.c | 40 ++-- relapack/src/slauum.c | 28 +-- relapack/src/spbtrf.c | 42 ++-- relapack/src/spotrf.c | 28 +-- relapack/src/ssygst.c | 38 ++-- relapack/src/ssytrf.c | 73 ++++--- relapack/src/ssytrf_rec2.c | 28 +-- relapack/src/ssytrf_rook.c | 72 +++---- relapack/src/ssytrf_rook_rec2.c | 32 +-- relapack/src/stgsyl.c | 66 +++--- relapack/src/strsyl.c | 56 ++--- relapack/src/strsyl_rec2.c | 50 ++--- relapack/src/strtri.c | 34 +-- relapack/src/zgbtrf.c | 78 +++---- relapack/src/zgemmt.c | 66 +++--- relapack/src/zgetrf.c | 40 ++-- relapack/src/zhegst.c | 38 ++-- relapack/src/zhetrf.c | 72 +++---- relapack/src/zhetrf_rec2.c | 36 ++-- relapack/src/zhetrf_rook.c | 72 +++---- relapack/src/zhetrf_rook_rec2.c | 38 ++-- relapack/src/zlauum.c | 28 +-- relapack/src/zpbtrf.c | 42 ++-- relapack/src/zpotrf.c | 28 +-- relapack/src/zsytrf.c | 72 +++---- relapack/src/zsytrf_rec2.c | 34 +-- relapack/src/zsytrf_rook.c | 72 +++---- relapack/src/zsytrf_rook_rec2.c | 36 ++-- relapack/src/ztgsyl.c | 62 +++--- relapack/src/ztrsyl.c | 52 ++--- relapack/src/ztrsyl_rec2.c | 42 ++-- relapack/src/ztrtri.c | 34 +-- 74 files changed, 2010 insertions(+), 1991 deletions(-) diff --git a/relapack/src/blas.h b/relapack/src/blas.h index 7441c1033d..6d9f1a42a2 100644 --- a/relapack/src/blas.h +++ b/relapack/src/blas.h @@ -1,61 +1,61 @@ #ifndef BLAS_H #define BLAS_H -extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); -extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); -extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); -extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); - -extern void BLAS(sscal)(const int *, const float *, float *, const int *); -extern void BLAS(dscal)(const int *, const double *, double *, const int *); -extern void BLAS(cscal)(const int *, const float *, float *, const int *); -extern void BLAS(zscal)(const int *, const double *, 
double *, const int *); - -extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); -extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); - -extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); -extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); - -extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); -extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); - -extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); -extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); - -extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); -extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); - -extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, 
const int *); -extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); -extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); -extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); - -extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); - -extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +extern void BLAS(sswap)(const blasint *, float *, const blasint *, float *, const blasint *); +extern void BLAS(dswap)(const blasint *, double *, const blasint *, double *, const blasint *); +extern void BLAS(cswap)(const blasint *, float *, const blasint *, float *, const blasint *); +extern void BLAS(zswap)(const blasint *, double *, const blasint *, double *, const blasint *); + +extern void BLAS(sscal)(const blasint *, const float *, float *, const blasint *); +extern void BLAS(dscal)(const blasint *, const double *, double *, const blasint *); +extern void BLAS(cscal)(const blasint *, const float *, float *, const blasint *); +extern void BLAS(zscal)(const blasint *, const double *, double *, const blasint *); + +extern void BLAS(saxpy)(const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(daxpy)(const blasint *, const double *, const double *, const blasint *, double *, const blasint *); +extern void BLAS(caxpy)(const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(zaxpy)(const blasint *, const double *, const double *, const blasint *, double *, const blasint *); + +extern void BLAS(sgemv)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(dgemv)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, 
const blasint *, const double *, const double *, const blasint*); +extern void BLAS(cgemv)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(zgemv)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); + +extern void BLAS(sgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(dgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); +extern void BLAS(cgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(zgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); + +extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); +extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); + +extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); +extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); + +extern void BLAS(ssyrk)(const char *, const char *, const blasint *, const blasint *, const float *, float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(dsyrk)(const char *, const char *, const blasint *, const blasint *, const double *, double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(cherk)(const char *, const char *, const blasint *, const blasint *, const float *, float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(zherk)(const char *, const char *, const blasint *, const blasint *, const double *, double 
*, const blasint *, const double *, double *, const blasint *); + +extern void BLAS(ssymm)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(dsymm)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(chemm)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(zhemm)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); + +extern void BLAS(ssyr2k)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(dsyr2k)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(cher2k)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(zher2k)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); #if HAVE_XGEMMT -extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); -extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(sgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(dgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); +extern void BLAS(cgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(zgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); #endif #endif /* 
BLAS_H */ diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index 90b2c87895..eddfdedf77 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, - const int *, float *, const int *, int *, float *, const int *, float *, - const int *, int *); +static void RELAPACK_cgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, float *, const blasint *, blasint *, float *, const blasint *, float *, + const blasint *, blasint *); /** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,9 +13,9 @@ static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html * */ void RELAPACK_cgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_cgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CGBTRF", &minfo, strlen("CGBTRF")); return; } @@ -40,14 +40,14 @@ void RELAPACK_cgbtrf( const float ZERO[] = { 0., 0. }; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { float *const A_j = A + 2 * *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +55,11 @@ void RELAPACK_cgbtrf( } // Allocate work space - const int n1 = CREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = CREC_SPLIT(*n); + const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const blasint nWorkl = (kv > n1) ? n1 : kv; + const blasint mWorku = (*kl > n1) ? n1 : *kl; + const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_cgbtrf( /** cgbtrf's recursive compute kernel */ static void RELAPACK_cgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + float *Workl, const blasint *ldWorkl, float *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_cgbtrf_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * kv; // Splitting - const int n1 = MIN(CREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(CREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +129,14 @@ static void RELAPACK_cgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +164,7 @@ static void RELAPACK_cgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -180,7 +180,7 @@ static void RELAPACK_cgbtrf_rec( for (j = 0; j < n22; j++) { float *const A_Rrj = A_Rr + 2 * *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const float tmpr = A_Rrj[2 * i]; const float tmpc = A_Rrj[2 * i + 1]; @@ -211,7 +211,7 @@ static void RELAPACK_cgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); diff --git a/relapack/src/cgemmt.c b/relapack/src/cgemmt.c index 28e2b00b01..3af4d790f5 100644 --- a/relapack/src/cgemmt.c +++ b/relapack/src/cgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_cgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); /** CGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_cgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float 
*alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,15 +32,15 @@ void RELAPACK_cgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int ctransA = LAPACK(lsame)(transA, "C"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - const int ctransB = LAPACK(lsame)(transB, "C"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint ctransA = LAPACK(lsame)(transA, "C"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + const blasint ctransB = LAPACK(lsame)(transB, "C"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !ctransA && !notransA) @@ -58,7 +58,7 @@ void RELAPACK_cgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("CGEMMT", &info); + LAPACK(xerbla)("CGEMMT", &info, strlen("CGEMMT")); return; } @@ -76,10 +76,10 @@ void RELAPACK_cgemmt( /** cgemmt's recursive compute kernel */ static void RELAPACK_cgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { @@ -89,8 +89,8 @@ static void RELAPACK_cgemmt_rec( } // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -126,16 +126,16 @@ static void RELAPACK_cgemmt_rec( /** cgemmt's unblocked compute kernel */ static void RELAPACK_cgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -151,13 +151,13 @@ static void RELAPACK_cgemmt_rec2( float *const C_ii = C + 2 * *ldC * i + 2 * i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index b31a711d0f..9aab718a0e 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_cgetrf_rec(const int *, const int *, float *, - const int *, int *, int *); +static void RELAPACK_cgetrf_rec(const blasint *, const blasint *, float *, + const blasint *, blasint *, blasint *); /** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_cgetrf_rec(const int *, const int *, float *, * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html */ void RELAPACK_cgetrf( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -25,12 +25,12 @@ void RELAPACK_cgetrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CGETRF", &minfo, strlen("CGETRF")); return; } - const int sn = MIN(*m, *n); + const blasint sn = MIN(*m, *n); RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_cgetrf( if (*m < *n) { // Constants const float ONE[] = { 1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const float *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_cgetrf( /** cgetrf's recursive compute kernel */ static void RELAPACK_cgetrf_rec( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { if (*n <= MAX(CROSSOVER_CGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_cgetrf_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R float *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_cgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_cgetrf_rec( // apply pivots to A_BL LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c index dff875017d..fe77b03eae 100644 --- a/relapack/src/chegst.c +++ b/relapack/src/chegst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_chegst_rec(const int *, const char *, const int *, - float *, const int *, const float *, const int *, - float *, const int *, int *); +static void RELAPACK_chegst_rec(const blasint *, const char *, const blasint *, + float *, const blasint *, const float *, const blasint *, + float *, const blasint *, blasint *); /** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_chegst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html * */ void RELAPACK_chegst( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_chegst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CHEGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CHEGST", &minfo, strlen("CHEGST")); return; } @@ -45,9 +45,9 @@ void RELAPACK_chegst( // Allocate work space float *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = CREC_SPLIT(*n); + const blasint n1 = CREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * 2 * sizeof(float)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_chegst( /** chegst's recursive compute kernel */ static void RELAPACK_chegst_rec( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - float *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_CHEGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_chegst_rec( const float MONE[] = { -1., 0. }; const float HALF[] = { .5, 0. }; const float MHALF[] = { -.5, 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/chetrf.c b/relapack/src/chetrf.c index 2928235e47..8cd3c07742 100644 --- a/relapack/src/chetrf.c +++ b/relapack/src/chetrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_chetrf_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html * */ void RELAPACK_chetrf( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_chetrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_chetrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_chetrf( /** chetrf's recursive compute kernel */ static void RELAPACK_chetrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_chetrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_chetrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_chetrf_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_chetrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_chetrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/chetrf_rec2.c b/relapack/src/chetrf_rec2.c index b5c8341b6b..412f64cf76 100644 --- a/relapack/src/chetrf_rec2.c +++ b/relapack/src/chetrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int * - nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, - int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, complex *w, + int *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2, r__3, r__4; complex q__1, q__2, q__3, q__4; @@ -38,22 +38,22 @@ static int c__1 = 1; void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k; + static blasint j, k; static float t, r1; static complex d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen), ccopy_(int *, complex *, int *, - complex *, int *), cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen), ccopy_(int *, complex *, blasint *, + complex *, blasint *), cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float absakk; - extern /* Subroutine */ int clacgv_(int *, complex *, int *); - extern int icamax_(int *, complex *, int *); - extern /* Subroutine */ int csscal_(int *, float *, complex *, int + extern /* Subroutine */ blasint clacgv_(int *, complex *, blasint *); + extern blasint icamax_(int *, complex *, blasint *); + extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int *); static float colmax, rowmax; diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c index 086393d576..3d2fa32160 100644 --- a/relapack/src/chetrf_rook.c +++ b/relapack/src/chetrf_rook.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *, - float *, const int *,
int *, float *, const int *, int *); +static void RELAPACK_chetrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html * */ void RELAPACK_chetrf_rook( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_chetrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_chetrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_chetrf_rook( /** chetrf_rook's recursive compute kernel */ static void RELAPACK_chetrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_chetrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_chetrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_chetrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_chetrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_chetrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/chetrf_rook_rec2.c b/relapack/src/chetrf_rook_rec2.c index a42cbfd44d..e0b2ff9628 100644 --- a/relapack/src/chetrf_rook_rec2.c +++ b/relapack/src/chetrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm.
* */ -/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, complex *a, int *lda, int *ipiv, - complex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, + complex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4, q__5; @@ -38,29 +38,29 @@ static int c__1 = 1; void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static float t, r1; static complex d11, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen); + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen); static float sfmin; - extern /* Subroutine */ int ccopy_(int *, complex *, int *, - complex *, int *); - static int itemp; - extern /* Subroutine */ int cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint ccopy_(int *, complex *, blasint *, + complex *, blasint *); + static blasint itemp; + extern /* Subroutine */ blasint cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float stemp, absakk; - extern /* Subroutine */ int clacgv_(int *, complex *, int *); - extern int icamax_(int *, complex *, int *); + extern /* Subroutine */ blasint clacgv_(int *, complex *, blasint *); + extern blasint icamax_(int *, complex *, blasint *); extern double slamch_(char *, ftnlen); - extern /* Subroutine */ int csscal_(int *, float *, complex *, int + extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int *); static float colmax, rowmax; diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c index 36d6297cfc..2bc93f182b 100644 --- a/relapack/src/clauum.c +++ b/relapack/src/clauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_clauum_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_clauum_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. 
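Every hunk in these files follows the same pattern: each integer that crosses the BLAS/LAPACK interface becomes blasint. A minimal sketch of why the substitution matters, assuming the usual OpenBLAS convention where an ILP64 build (USE64BITINT) widens BLAS integers to 64 bits; the dscal_ prototype shown is the standard Fortran BLAS calling convention, not code from this patch:

#include <stdint.h>

#ifdef USE64BITINT
typedef int64_t blasint;   /* ILP64 interface: BLAS integers are 8 bytes */
#else
typedef int blasint;       /* LP64 interface: BLAS integers are 4 bytes  */
#endif

/* Fortran-style BLAS routine: every argument is passed by reference */
extern void dscal_(const blasint *n, const double *alpha,
                   double *x, const blasint *incx);

void scale_by_two(double *x, blasint n) {
    const double alpha = 2.0;
    const blasint incx = 1;   /* before the patch this was "const int incx":
                                 on an ILP64 build the callee reads 8 bytes
                                 from a 4-byte object, i.e. stack garbage */
    dscal_(&n, &alpha, x, &incx);
}

Passing the address of a too-narrow integer is exactly the bug class the wholesale int -> blasint rewrite above removes.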
@@ -11,14 +11,14 @@ static void RELAPACK_clauum_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html * */ void RELAPACK_clauum( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_clauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CLAUUM", &minfo, strlen("CLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_clauum( /** clauum's recursive compute kernel */ static void RELAPACK_clauum_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_clauum_rec( const float ONE[] = { 1., 0. }; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c index e0ea7b944a..971e547c64 100644 --- a/relapack/src/cpbtrf.c +++ b/relapack/src/cpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, - float *, const int *, float *, const int *, int *); +static void RELAPACK_cpbtrf_rec(const char *, const blasint *, const blasint *, + float *, const blasint *, float *, const blasint *, blasint *); /** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. @@ -12,14 +12,14 @@ static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html * */ void RELAPACK_cpbtrf( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_cpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CPBTRF", &minfo, strlen("CPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_cpbtrf( const float ZERO[] = { 0., 0. }; // Allocate work space - const int n1 = CREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = CREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd; float *Work = malloc(mWork * nWork * 2 * sizeof(float)); LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_cpbtrf( /** cpbtrf's recursive compute kernel */ static void RELAPACK_cpbtrf_rec( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - float *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + float *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_cpbtrf_rec( const float MONE[] = { -1., 0. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(CREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(CREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_cpbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, *kd); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c index e35caa7fa8..0f8e7ebb06 100644 --- a/relapack/src/cpotrf.c +++ b/relapack/src/cpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_cpotrf_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_cpotrf_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. @@ -11,14 +11,14 @@ static void RELAPACK_cpotrf_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html * */ void RELAPACK_cpotrf( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_cpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CPOTRF", &minfo, strlen("CPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_cpotrf( /** cpotrf's recursive compute kernel */ static void RELAPACK_cpotrf_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_cpotrf_rec( const float MONE[] = { -1., 0. 
}; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c index 01c161d1ae..2ebc310014 100644 --- a/relapack/src/csytrf.c +++ b/relapack/src/csytrf.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_csytrf_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html * */ void RELAPACK_csytrf( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_csytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_csytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_csytrf( /** csytrf's recursive compute kernel */ static void RELAPACK_csytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_csytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0.
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_csytrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_csytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_csytrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/csytrf_rec2.c b/relapack/src/csytrf_rec2.c index 9d6bd849d0..216a9e2484 100644 --- a/relapack/src/csytrf_rec2.c +++ b/relapack/src/csytrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int * - nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, - int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, complex *w, + int *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2, r__3, r__4; complex q__1, q__2, q__3; @@ -38,21 +38,21 @@ static int c__1 = 1; void c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k; + static blasint j, k; static complex t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; - extern /* Subroutine */ int cscal_(int *, complex *, complex *, - int *); + extern /* Subroutine */ blasint cscal_(int *, complex *, complex *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen), ccopy_(int *, complex *, int *, - complex *, int *), cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen), ccopy_(int *, complex *, blasint *, + complex *, blasint *), cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float absakk; - extern int icamax_(int *, complex *, int *); + extern blasint icamax_(int *, complex *, blasint *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c index aa7dd0e57a..e8a9865cca 100644 --- a/relapack/src/csytrf_rook.c +++ b/relapack/src/csytrf_rook.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *, - float *, const
blasint *, blasint *, float *, const blasint *, blasint *); /** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html * */ void RELAPACK_csytrf_rook( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_csytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_csytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_csytrf_rook( /** csytrf_rook's recursive compute kernel */ static void RELAPACK_csytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_csytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_csytrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_csytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_csytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_csytrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/csytrf_rook_rec2.c b/relapack/src/csytrf_rook_rec2.c index 6638338a60..2561065d7b 100644 --- a/relapack/src/csytrf_rook_rec2.c +++ b/relapack/src/csytrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm.
* */ -/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, complex *a, int *lda, int *ipiv, - complex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, + complex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4; @@ -38,27 +38,27 @@ static int c__1 = 1; void c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static complex t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static float alpha; - extern /* Subroutine */ int cscal_(int *, complex *, complex *, - int *); + extern /* Subroutine */ blasint cscal_(int *, complex *, complex *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen); + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen); static float sfmin; - extern /* Subroutine */ int ccopy_(int *, complex *, int *, - complex *, int *); - static int itemp; - extern /* Subroutine */ int cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint ccopy_(int *, complex *, blasint *, + complex *, blasint *); + static blasint itemp; + extern /* Subroutine */ blasint cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float stemp, absakk; - extern int icamax_(int *, complex *, int *); + extern blasint icamax_(int *, complex *, blasint *); extern double slamch_(char *, ftnlen); static float colmax, rowmax; diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c index 15c738baf2..704f3ef232 100644 --- a/relapack/src/ctgsyl.c +++ b/relapack/src/ctgsyl.c @@ -1,10 +1,10 @@ #include "relapack.h" #include <math.h> -static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, - const int *, const float *, const int *, const float *, const int *, - float *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, float *, float *, int *); +static void RELAPACK_ctgsyl_rec(const char *, const blasint *, const blasint *, + const blasint *, const float *, const blasint *, const float *, const blasint *, + float *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, float *, float *, blasint *); /** CTGSYL solves the generalized Sylvester equation.
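The other change repeated at every error-handling site is the extra strlen(...) argument on xerbla. A hedged sketch of the convention being matched: XERBLA's first parameter is a Fortran CHARACTER(*) argument, so the compiled routine expects a hidden trailing length; size_t is what recent gfortran uses for that hidden parameter (older ABIs used int, which is why the two-argument form appeared to work). The helper name below is illustrative, not from the patch:

#include <stddef.h>   /* size_t */
#include <string.h>   /* strlen */

typedef int blasint;  /* stand-in; see the ILP64 note above */

/* Fortran XERBLA as seen from C: declared args, then the hidden length */
extern void xerbla_(const char *srname, const blasint *info, size_t srname_len);

static void report_bad_argument(blasint info) {
    const blasint minfo = -info;
    /* the patched call sites pass the hidden length explicitly */
    xerbla_("CTGSYL", &minfo, strlen("CTGSYL"));
}

Omitting the hidden length leaves the callee reading an uninitialized register or stack slot for the name length, so the fix is ABI correctness rather than a behavior change.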
@@ -14,21 +14,21 @@ static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html * */ void RELAPACK_ctgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "C"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "C"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -57,8 +57,8 @@ void RELAPACK_ctgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CTGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CTGSYL", &minfo, strlen("CTGSYL")); return; } @@ -74,8 +74,8 @@ void RELAPACK_ctgsyl( // Constant const float ZERO[] = { 0., 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -86,7 +86,7 @@ void RELAPACK_ctgsyl( } float scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; float dscale = 0; @@ -119,13 +119,13 @@ void RELAPACK_ctgsyl( /** ctgsyl's recursive compute kernel */ static void RELAPACK_ctgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dsum, float *dscale, - int *info + blasint *info ) { if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) { @@ -137,18 +137,18 @@ static void RELAPACK_ctgsyl_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1., 0. }; float scale2[] = { 1., 0.
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = CREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = CREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -206,8 +206,8 @@ static void RELAPACK_ctgsyl_rec( } } else { // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c index b548d5354d..fed6e847e5 100644 --- a/relapack/src/ctrsyl.c +++ b/relapack/src/ctrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, - const int *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, int *); +static void RELAPACK_ctrsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, blasint *); /** CTRSYL solves the complex Sylvester matrix equation. @@ -12,18 +12,18 @@ static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html * */ void RELAPACK_ctrsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!ctransA && !notransA) *info = -1; @@ -42,8 +42,8 @@ void RELAPACK_ctrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CTRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CTRSYL", &minfo, strlen("CTRSYL")); return; } @@ -58,11 +58,11 @@ void RELAPACK_ctrsyl( /** ctrsyl's recursive compute kernel */ static void RELAPACK_ctrsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { @@ -75,18 +75,18 @@ static void RELAPACK_ctrsyl_rec( const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; const float MSGN[] = { -*isgn, 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1., 0. }; float scale2[] = { 1., 0. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = CREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = CREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -122,8 +122,8 @@ static void RELAPACK_ctrsyl_rec( } } else { // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c index 518574868a..556491c7a1 100644 --- a/relapack/src/ctrsyl_rec2.c +++ b/relapack/src/ctrsyl_rec2.c @@ -14,16 +14,16 @@ #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES -complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) { - extern void cdotu_(complex *, int *, complex *, int *, complex *, int *); +complex cdotu_fun(int *n, complex *x, blasint *incx, complex *y, blasint *incy) { + extern void cdotu_(complex *, blasint *, complex *, blasint *, complex *, blasint *); complex result; cdotu_(&result, n, x, incx, y, incy); return result; } #define cdotu_ cdotu_fun -complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) { - extern void cdotc_(complex *, int *, complex *, int *, complex *, int *); +complex cdotc_fun(int *n, complex *x, blasint *incx, complex *y, blasint *incy) { + extern void cdotc_(complex *, blasint *, complex *, blasint *, complex *, blasint *); complex result; cdotc_(&result, n, x, incx, y, incy); return result; @@ -43,7 +43,7 @@ complex cladiv_fun(complex *a, complex *b) { /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; /** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) * @@ -51,12 +51,12 @@ static int c__1 = 1; * It serves as an unblocked kernel in the recursive algorithms. 
* */ /* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int - *isgn, int *m, int *n, complex *a, int *lda, complex *b, - int *ldb, complex *c__, int *ldc, float *scale, int *info, + *isgn, blasint *m, blasint *n, complex *a, blasint *lda, complex *b, + int *ldb, complex *c__, blasint *ldc, float *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4; @@ -66,7 +66,7 @@ static int c__1 = 1; void r_cnjg(complex *, complex *); /* Local variables */ - static int j, k, l; + static blasint j, k, l; static complex a11; static float db; static complex x11; @@ -75,20 +75,20 @@ static int c__1 = 1; static float dum[1], eps, sgn, smin; static complex suml, sumr; /* Complex */ complex cdotc_(int *, complex *, int - *, complex *, int *); - extern int lsame_(char *, char *, ftnlen, ftnlen); + *, complex *, blasint *); + extern blasint lsame_(char *, char *, ftnlen, ftnlen); /* Complex */ complex cdotu_(int *, complex *, int - *, complex *, int *); - extern /* Subroutine */ int slabad_(float *, float *); - extern float clange_(char *, int *, int *, complex *, - int *, float *, ftnlen); + *, complex *, blasint *); + extern /* Subroutine */ blasint slabad_(float *, float *); + extern float clange_(char *, blasint *, blasint *, complex *, + blasint *, float *, ftnlen); /* Complex */ complex cladiv_(complex *, complex *); static float scaloc; extern float slamch_(char *, ftnlen); - extern /* Subroutine */ int csscal_(int *, float *, complex *, int - *), xerbla_(char *, int *, ftnlen); + extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int + *), xerbla_(char *, blasint *, ftnlen); static float bignum; - static int notrna, notrnb; + static blasint notrna, notrnb; static float smlnum; /* Parameter adjustments */ diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c index 0262cb59d9..5201a24c73 100644 --- a/relapack/src/ctrtri.c +++ b/relapack/src/ctrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, - float *, const int *, int *); +static void RELAPACK_ctrtri_rec(const char *, const char *, const blasint *, + float *, const blasint *, blasint *); /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. 
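The factor-of-2 index arithmetic that runs through all the complex kernels above (expressions like Work + 2 * *ldWork * n1 + 2 * n1, and the diagonal test in ctrtri's singularity check below) reflects how these routines store a single-precision complex matrix: a plain float array in column-major order with interleaved real/imaginary parts. A small sketch of that addressing; the helper names are illustrative, not from the patch:

#include <stddef.h>

typedef int blasint;

/* Column-major complex matrix stored as interleaved floats:
 * element (i,j) occupies A[2*(i + ldA*j)] (real part) and the float
 * immediately after it (imaginary part). */
static float *celem(float *A, blasint ldA, blasint i, blasint j) {
    return A + 2 * ((size_t)ldA * j + i);
}

/* e.g. the kind of diagonal test ctrtri uses to flag singularity */
static int diag_is_zero(float *A, blasint ldA, blasint i) {
    const float *aii = celem(A, ldA, i, i);
    return aii[0] == 0.f && aii[1] == 0.f;
}

This is also why the double-precision files (dgbtrf, dgemmt, dgetrf below) use the same submatrix expressions without the leading 2.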
@@ -11,16 +11,16 @@ static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html * */ void RELAPACK_ctrtri( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_ctrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CTRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CTRTRI", &minfo, strlen("CTRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_ctrtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_ctrtri( /** ctrtri's recursive compute kernel */ static void RELAPACK_ctrtri_rec( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_ctrtri_rec( const float MONE[] = { -1., 0. }; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index 1a1757d311..f4b443629b 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -1,9 +1,8 @@ #include "relapack.h" #include "stdlib.h" - -static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, - const int *, double *, const int *, int *, double *, const int *, double *, - const int *, int *); +static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, + const blasint *, blasint *); /** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,9 +12,9 @@ static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html * */ void RELAPACK_dgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { // Check arguments @@ -31,8 +30,8 @@ void RELAPACK_dgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DGBTRF", &minfo, strlen("DGBTRF")); return; } @@ -40,14 +39,14 @@ void RELAPACK_dgbtrf( const double ZERO[] = { 0. 
}; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { double *const A_j = A + *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +54,12 @@ void RELAPACK_dgbtrf( } // Allocate work space - const int n1 = DREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = DREC_SPLIT(*n); + const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs( (kv > n1) ? n1 : kv); + const blasint mWorku = abs( (*kl > n1) ? n1 : *kl); +// const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl); + const blasint nWorku = abs( (*kl > n1) ? MAX(1, *n - *kl) : *kl); double *Workl = malloc(mWorkl * nWorkl * sizeof(double)); double *Worku = malloc(mWorku * nWorku * sizeof(double)); LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_dgbtrf( /** dgbtrf's recursive compute kernel */ static void RELAPACK_dgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + double *Workl, const blasint *ldWorkl, double *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_dgbtrf_rec( // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + kv; // Splitting - const int n1 = MIN(DREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(DREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +129,14 @@ static void RELAPACK_dgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +164,7 @@ static void RELAPACK_dgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -180,7 +180,7 @@ static void RELAPACK_dgbtrf_rec( for (j = 0; j < n22; j++) { double *const A_Rrj = A_Rr + *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const double tmp = A_Rrj[i]; A_Rrj[i] = A_Rr[ip]; @@ -208,7 +208,7 @@ static void RELAPACK_dgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); diff --git a/relapack/src/dgemmt.c b/relapack/src/dgemmt.c index 9c925b5861..1ceab6c377 100644 --- a/relapack/src/dgemmt.c +++ b/relapack/src/dgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); /** DGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_dgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const 
double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,13 +32,13 @@ void RELAPACK_dgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !notransA) @@ -56,7 +56,7 @@ void RELAPACK_dgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("DGEMMT", &info); + LAPACK(xerbla)("DGEMMT", &info, strlen("DGEMMT")); return; } @@ -74,10 +74,10 @@ void RELAPACK_dgemmt( /** dgemmt's recursive compute kernel */ static void RELAPACK_dgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { @@ -87,8 +87,8 @@ static void RELAPACK_dgemmt_rec( } // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -124,16 +124,16 @@ static void RELAPACK_dgemmt_rec( /** dgemmt's unblocked compute kernel */ static void RELAPACK_dgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -149,13 +149,13 @@ static void RELAPACK_dgemmt_rec2( double *const C_ii = C + *ldC * i + i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c index 07f5472fd1..c4bce8fc5d 100644 --- a/relapack/src/dgetrf.c +++ b/relapack/src/dgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dgetrf_rec(const int *, const int *, double *, - const int *, int *, int *); +static void RELAPACK_dgetrf_rec(const blasint *, const blasint *, double *, + const blasint *, blasint *, blasint *); /** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_dgetrf_rec(const int *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html * */ void RELAPACK_dgetrf( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -25,12 +25,12 @@ void RELAPACK_dgetrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF")); return; } - const int sn = MIN(*m, *n); + const blasint sn = MIN(*m, *n); RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_dgetrf( if (*m < *n) { // Constants const double ONE[] = { 1. }; - const int iONE[] = { 1. }; + const blasint iONE[] = { 1. }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const double *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_dgetrf( /** dgetrf's recursive compute kernel */ static void RELAPACK_dgetrf_rec( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { if (*n <= MAX(CROSSOVER_DGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_dgetrf_rec( // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R double *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_dgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_dgetrf_rec( // apply pivots to A_BL LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/dlauum.c b/relapack/src/dlauum.c index d722ea809f..6c7dcccb33 100644 --- a/relapack/src/dlauum.c +++ b/relapack/src/dlauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dlauum_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_dlauum_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_dlauum_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html * */ void RELAPACK_dlauum( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_dlauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DLAUUM", &minfo, strlen("DLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_dlauum( /** dlauum's recursive compute kernel */ static void RELAPACK_dlauum_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_dlauum_rec( const double ONE[] = { 1. }; // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c index 6fd0ebe481..9380b28ad6 100644 --- a/relapack/src/dpbtrf.c +++ b/relapack/src/dpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, - double *, const int *, double *, const int *, int *); +static void RELAPACK_dpbtrf_rec(const char *, const blasint *, const blasint *, + double *, const blasint *, double *, const blasint *, blasint *); /** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. 
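All of the banded kernels touched here (dgbtrf above, dpbtrf below) lean on the same "Unskew A" trick: shift the base pointer of the band buffer by the band width and shrink the leading dimension by one, and LAPACK band storage becomes addressable like a plain dense matrix. A minimal sketch of why the addresses coincide, assuming 0-based column-major band storage with KD superdiagonals (illustration only, not part of the patch):

#include <assert.h>

int main(void) {
    enum { KD = 2, N = 6, LDAB = KD + 1 };
    double Ab[LDAB * N];            /* band storage buffer            */
    double *const A = Ab + KD;      /* the "Unskew A" base shift      */
    const int ldA = LDAB - 1;       /* the { *ldAb - 1 } seen above   */
    int i, j;

    /* Band storage places A(i,j) at Ab[KD + i - j + LDAB*j]; the
       dense view reaches the same cell as A[i + ldA*j].              */
    for (j = 0; j < N; j++)
        for (i = (j > KD ? j - KD : 0); i <= j; i++)
            assert(&A[i + ldA * j] == &Ab[KD + i - j + LDAB * j]);
    return 0;
}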
@@ -12,14 +12,14 @@ static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html * */ void RELAPACK_dpbtrf( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_dpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DPBTRF", &minfo, strlen("DPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_dpbtrf( const double ZERO[] = { 0. }; // Allocate work space - const int n1 = DREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = DREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; double *Work = malloc(mWork * nWork * sizeof(double)); LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_dpbtrf( /** dpbtrf's recursive compute kernel */ static void RELAPACK_dpbtrf_rec( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - double *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + double *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_dpbtrf_rec( const double MONE[] = { -1. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(DREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(DREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_dpbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, n1); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, n1); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/dpotrf.c b/relapack/src/dpotrf.c index c14fb3d718..cf326b18fd 100644 --- a/relapack/src/dpotrf.c +++ b/relapack/src/dpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dpotrf_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_dpotrf_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. 
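The shape of RELAPACK_dpotrf_rec (whose hunks follow) is easier to see without the BLAS plumbing. A self-contained sketch of the same recursion for uplo='L', with naive loops standing in for the dtrsm/dsyrk calls and a plain halving split standing in for DREC_SPLIT; the names and the split rule are illustrative assumptions, not the patch's code:

#include <math.h>

static void chol_rec(int n, double *A, int ldA) {
    if (n == 1) { A[0] = sqrt(A[0]); return; }

    int n1 = n / 2, n2 = n - n1;          /* stand-in for DREC_SPLIT */
    double *A_TL = A;                     /* A_TL  *                 */
    double *A_BL = A + n1;                /* A_BL  A_BR              */
    double *A_BR = A + ldA * n1 + n1;
    int i, j, k;

    chol_rec(n1, A_TL, ldA);              /* recursion(A_TL)         */

    /* A_BL = A_BL * L_TL^-T: the dtrsm("R", "L", "T", ...) step     */
    for (j = 0; j < n1; j++)
        for (i = 0; i < n2; i++) {
            double s = A_BL[i + ldA * j];
            for (k = 0; k < j; k++)
                s -= A_BL[i + ldA * k] * A_TL[j + ldA * k];
            A_BL[i + ldA * j] = s / A_TL[j + ldA * j];
        }

    /* A_BR = A_BR - A_BL * A_BL^T (lower part): the dsyrk step      */
    for (j = 0; j < n2; j++)
        for (i = j; i < n2; i++)
            for (k = 0; k < n1; k++)
                A_BR[i + ldA * j] -= A_BL[i + ldA * k] * A_BL[j + ldA * k];

    chol_rec(n2, A_BR, ldA);              /* recursion(A_BR)         */
}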
@@ -11,14 +11,14 @@ static void RELAPACK_dpotrf_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html * */ void RELAPACK_dpotrf( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_dpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DPOTRF", &minfo, strlen("DPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_dpotrf( /** dpotrf's recursive compute kernel */ static void RELAPACK_dpotrf_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_dpotrf_rec( const double MONE[] = { -1. }; // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dsygst.c b/relapack/src/dsygst.c index 0228068cef..f68241e3ab 100644 --- a/relapack/src/dsygst.c +++ b/relapack/src/dsygst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_dsygst_rec(const int *, const char *, const int *, - double *, const int *, const double *, const int *, - double *, const int *, int *); +static void RELAPACK_dsygst_rec(const blasint *, const char *, const blasint *, + double *, const blasint *, const double *, const blasint *, + double *, const blasint *, blasint *); /** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. 
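For orientation (summarized from the LAPACK documentation for DSYGST, not from this patch): with B already factored by DPOTRF as B = U**T*U or B = L*L**T, ITYPE=1 reduces A*x = lambda*B*x by overwriting A with inv(U**T)*A*inv(U) resp. inv(L)*A*inv(L**T), while ITYPE=2 or 3 reduces A*B*x = lambda*x resp. B*A*x = lambda*x by overwriting A with U*A*U**T resp. L**T*A*L. The HALF/MHALF constants in the recursive kernel below belong to the +/-0.5 corrections applied around the SYR2K update when the transformation is split across the two recursive calls.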
@@ -15,14 +15,14 @@ static void RELAPACK_dsygst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html * */ void RELAPACK_dsygst( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_dsygst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DSYGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DSYGST", &minfo, strlen("DSYGST")); return; } @@ -45,10 +45,10 @@ void RELAPACK_dsygst( // Allocate work space double *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = DREC_SPLIT(*n); - lWork = n1 * (*n - n1); + const blasint n1 = DREC_SPLIT(*n); + lWork = abs( n1 * (*n - n1) ); Work = malloc(lWork * sizeof(double)); if (!Work) lWork = 0; @@ -67,9 +67,9 @@ void RELAPACK_dsygst( /** dsygst's recursive compute kernel */ static void RELAPACK_dsygst_rec( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - double *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_SSYGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_dsygst_rec( const double MONE[] = { -1. }; const double HALF[] = { .5 }; const double MHALF[] = { -.5 }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c index 80b119336a..43d28f94eb 100644 --- a/relapack/src/dsytrf.c +++ b/relapack/src/dsytrf.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_dsytrf_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** DSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
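The pivot bookkeeping in the recursive kernels below follows the standard DSYTRF encoding, which is why sub-block pivot indices have to be shifted by n1 after the second recursive call (the `ipiv_B[i] += n1` loops). A short decoder for the uplo='L' convention, with a made-up ipiv array purely as an example:

#include <stdio.h>

/* Decode DSYTRF-style pivots (uplo='L', 1-based indices):
   ipiv(k) > 0             -> 1x1 block, rows k and ipiv(k) swapped
   ipiv(k) = ipiv(k+1) < 0 -> 2x2 block, rows k+1 and -ipiv(k) swapped */
static void print_pivots(int n, const int *ipiv) {
    int k = 1;
    while (k <= n) {
        if (ipiv[k - 1] > 0) {
            printf("k=%d: 1x1 pivot, rows %d <-> %d\n", k, k, ipiv[k - 1]);
            k += 1;
        } else {
            printf("k=%d: 2x2 pivot, rows %d <-> %d\n", k, k + 1, -ipiv[k - 1]);
            k += 2;
        }
    }
}

int main(void) {
    const int ipiv[] = { 3, -4, -4, 4 };   /* made-up example output */
    print_pivots(4, ipiv);
    return 0;
}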
@@ -14,21 +14,21 @@ static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html * */ void RELAPACK_dsytrf( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_dsytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_dsytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_dsytrf( /** dsytrf's recursive compute kernel */ static void RELAPACK_dsytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_DSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_dsytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = DREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = DREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_dsytrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + n1; double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_dsytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = DREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = DREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_dsytrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/dsytrf_rec2.c b/relapack/src/dsytrf_rec2.c index 72ef827b16..6ed1a47a25 100644 --- a/relapack/src/dsytrf_rec2.c +++ b/relapack/src/dsytrf_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static double c_b8 = -1.; static double c_b9 = 1.; @@ -25,33 +25,33 @@ static double c_b9 = 1.; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int * - nb, int *kb, double *a, int *lda, int *ipiv, - double *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, double *a, blasint *lda, blasint *ipiv, + double *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; double d__1, d__2, d__3; /* Builtin functions */ double sqrt(double); /* Local variables */ - static int j, k; + static blasint j, k; static double t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; - extern /* Subroutine */ int dscal_(int *, double *, double *, - int *); + extern /* Subroutine */ blasint dscal_(blasint *, double *, double *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int dgemv_(char *, int *, int *, - double *, double *, int *, double *, int *, - double *, double *, int *, ftnlen), dcopy_(int *, - double *, int *, double *, int *), dswap_(int - *, double *, int *, double *, int *); - static int kstep; + extern /* Subroutine */ blasint dgemv_(char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *, + double *, double *, blasint *, ftnlen), dcopy_(blasint *, + double *, blasint *, double *, blasint *), dswap_(blasint + *, double *, blasint *, double *, blasint *); + static blasint kstep; static double absakk; - extern int idamax_(int *, double *, int *); + extern blasint idamax_(blasint *, double *, blasint *); static double colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c index 19a875c7ad..78fa652abe 100644 --- a/relapack/src/dsytrf_rook.c +++ b/relapack/src/dsytrf_rook.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_dsytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
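Rook pivoting stores both entries of a 2x2 pivot pair as negated row indices, so when the recursion glues the trailing sub-block back in, positive and negative entries must be shifted in opposite directions. The hunk further below only shows the positive branch as a changed line; a self-contained sketch of the full loop, where the else-branch and the local blasint typedef are assumptions for illustration:

typedef int blasint;    /* 32-bit case from f2c.h; illustration only */

static void shift_rook_pivots(blasint n2, blasint n1, blasint *ipiv_B) {
    blasint i;
    for (i = 0; i < n2; i++)
        if (ipiv_B[i] > 0)
            ipiv_B[i] += n1;    /* 1x1 pivot: plain 1-based row index */
        else
            ipiv_B[i] -= n1;    /* 2x2 pivot: index stored negated,
                                   so the shift goes the other way    */
}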
@@ -14,21 +14,21 @@ static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html * */ void RELAPACK_dsytrf_rook( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_dsytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_dsytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_dsytrf_rook( /** dsytrf_rook's recursive compute kernel */ static void RELAPACK_dsytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_dsytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = DREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = DREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_dsytrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + n1; double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_dsytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_dsytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = DREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = DREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_dsytrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/dsytrf_rook_rec2.c b/relapack/src/dsytrf_rook_rec2.c index 105ef5ed3e..bdb5c6e29c 100644 --- a/relapack/src/dsytrf_rook_rec2.c +++ b/relapack/src/dsytrf_rook_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static double c_b9 = -1.; static double c_b10 = 1.; @@ -25,39 +25,39 @@ static double c_b10 = 1.; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, double *a, int *lda, int *ipiv, - double *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, blasint *n, + blasint *nb, blasint *kb, double *a, blasint *lda, blasint *ipiv, + double *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; double d__1; /* Builtin functions */ double sqrt(double); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static double t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static double alpha; - extern /* Subroutine */ int dscal_(int *, double *, double *, - int *); + extern /* Subroutine */ blasint dscal_(blasint *, double *, double *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int dgemv_(char *, int *, int *, - double *, double *, int *, double *, int *, - double *, double *, int *, ftnlen); + extern /* Subroutine */ blasint dgemv_(char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *, + double *, double *, blasint *, ftnlen); static double dtemp, sfmin; - static int itemp; - extern /* Subroutine */ int dcopy_(int *, double *, int *, - double *, int *), dswap_(int *, double *, int - *, double *, int *); - static int kstep; + static blasint itemp; + extern /* Subroutine */ blasint dcopy_(blasint *, double *, blasint *, + double *, blasint *), dswap_(blasint *, double *, blasint + *, double *, blasint *); + static blasint kstep; extern double dlamch_(char *, ftnlen); static double absakk; - extern int idamax_(int *, double *, int *); + extern blasint idamax_(blasint *, double *, blasint *); static double colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/dtgsyl.c b/relapack/src/dtgsyl.c index c506926af2..9bbc987e71 100644 --- a/relapack/src/dtgsyl.c +++ b/relapack/src/dtgsyl.c @@ -1,11 +1,11 @@ #include "relapack.h" #include <math.h> -static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *, - const int *, const double *, const int *, const double *, const int *, - double *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, double *, double *, int *, - int *, int *); +static void RELAPACK_dtgsyl_rec(const char *, const blasint *, const blasint *, - const blasint *, const double *, const blasint *, const double *, const blasint *, + const blasint *, const double *, const blasint *, const double *, const blasint *, + double *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, double *, double *, blasint *, + blasint *, blasint *); /** DTGSYL solves the generalized Sylvester equation.
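For reference (from the LAPACK documentation for DTGSYL, not from this patch): with TRANS = 'N', A and D m-by-m and B and E n-by-n (quasi-)triangular pairs, DTGSYL solves

    A * R - L * B = scale * C
    D * R - L * E = scale * F

for the m-by-n unknowns (R, L), which overwrite C and F. The recursive kernel splits along the larger of the two dimensions, which is why it branches on *m > *n below.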
@@ -15,21 +15,21 @@ static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html * */ void RELAPACK_dtgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info + double *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "T"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "T"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -58,8 +58,8 @@ void RELAPACK_dtgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DTGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DTGSYL", &minfo, strlen("DTGSYL")); return; } @@ -75,8 +75,8 @@ void RELAPACK_dtgsyl( // Constant const double ZERO[] = { 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -87,12 +87,12 @@ void RELAPACK_dtgsyl( } double scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; double dscale = 0; double dsum = 1; - int pq; + blasint pq; RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); if (dscale != 0) { if (*ijob == 1 || *ijob == 3) @@ -121,13 +121,13 @@ void RELAPACK_dtgsyl( /** dtgsyl's recursive compute kernel */ static void RELAPACK_dtgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dsum, double *dscale, - int *iWork, int *pq, int *info + blasint *iWork, blasint *pq, blasint *info ) { if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) { @@ -139,20 +139,20 @@ static void RELAPACK_dtgsyl_rec( // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs double scale1[] = { 1. }; double scale2[] = { 1.
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - int m1 = DREC_SPLIT(*m); + blasint m1 = DREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const int m2 = *m - m1; + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -210,10 +210,10 @@ static void RELAPACK_dtgsyl_rec( } } else { // Splitting - int n1 = DREC_SPLIT(*n); + blasint n1 = DREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const int n2 = *n - n1; + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c index c87b53ae52..7663773007 100644 --- a/relapack/src/dtrsyl.c +++ b/relapack/src/dtrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *, - const int *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, int *); +static void RELAPACK_dtrsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, blasint *); /** DTRSYL solves the real Sylvester matrix equation. @@ -12,20 +12,20 @@ static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html * */ void RELAPACK_dtrsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int transA = LAPACK(lsame)(tranA, "T"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int transB = LAPACK(lsame)(tranB, "T"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint transA = LAPACK(lsame)(tranA, "T"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint transB = LAPACK(lsame)(tranB, "T"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!transA && !ctransA && !notransA) *info = -1; @@ -44,8 +44,8 @@ void RELAPACK_dtrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DTRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DTRSYL", &minfo, strlen("DTRSYL")); return; } @@ -60,11 +60,11 @@ void RELAPACK_dtrsyl( /** dtrsyl's recursive compute kernel */ static void RELAPACK_dtrsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) { @@ -77,20 +77,20 @@ static void RELAPACK_dtrsyl_rec( const double ONE[] = { 1. 
}; const double MONE[] = { -1. }; const double MSGN[] = { -*isgn }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs double scale1[] = { 1. }; double scale2[] = { 1. }; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - int m1 = DREC_SPLIT(*m); + blasint m1 = DREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const int m2 = *m - m1; + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -126,10 +126,10 @@ static void RELAPACK_dtrsyl_rec( } } else { // Splitting - int n1 = DREC_SPLIT(*n); + blasint n1 = DREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const int n2 = *n - n1; + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/dtrsyl_rec2.c b/relapack/src/dtrsyl_rec2.c index 479c7f340a..50dabf76d7 100644 --- a/relapack/src/dtrsyl_rec2.c +++ b/relapack/src/dtrsyl_rec2.c @@ -14,52 +14,52 @@ /* Table of constant values */ -static int c__1 = 1; -static int c_false = FALSE_; -static int c__2 = 2; +static blasint c__1 = 1; +static blasint c_false = FALSE_; +static blasint c__2 = 2; static double c_b26 = 1.; static double c_b30 = 0.; -static int c_true = TRUE_; +static blasint c_true = TRUE_; -int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, int *isgn, int - *m, int *n, double *a, int *lda, double *b, int * - ldb, double *c__, int *ldc, double *scale, int *info, +int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, blasint *isgn, blasint + *m, blasint *n, double *a, blasint *lda, double *b, blasint * + ldb, double *c__, blasint *ldc, double *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; double d__1, d__2; /* Local variables */ - static int j, k, l; + static blasint j, k, l; static double x[4] /* was [2][2] */; - static int k1, k2, l1, l2; + static blasint k1, k2, l1, l2; static double a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn; - extern double ddot_(int *, double *, int *, double *, - int *); - static int ierr; + extern double ddot_(blasint *, double *, blasint *, double *, + blasint *); + static blasint ierr; static double smin, suml, sumr; - extern /* Subroutine */ int dscal_(int *, double *, double *, - int *); - extern int lsame_(char *, char *, ftnlen, ftnlen); - static int knext, lnext; + extern /* Subroutine */ blasint dscal_(blasint *, double *, double *, + blasint *); + extern blasint lsame_(char *, char *, ftnlen, ftnlen); + static blasint knext, lnext; static double xnorm; - extern /* Subroutine */ int dlaln2_(int *, int *, int *, - double *, double *, double *, int *, double *, - double *, double *, int *, double *, double * - , double *, int *, double *, double *, int *), - dlasy2_(int *, int *, int *, int *, int *, - double *, int *, double *, int *, double *, - int *, double *, double *, int *, double *, - int *), dlabad_(double *, double *); - extern double dlamch_(char *, ftnlen), dlange_(char *, int *, - int *, double *, int *, double *, ftnlen); + extern /* Subroutine */ blasint dlaln2_(blasint *, blasint *, blasint *, + double *, double *, double *, blasint *, double *, + double *, double *, blasint *, double *, double * + , double *, blasint *, double *, double *, blasint *), + dlasy2_(blasint *, blasint *, blasint *, blasint *, blasint *, + double *, blasint *, double *, blasint *, double *, + blasint *, double *, double *, blasint *, double *, +
blasint *), dlabad_(double *, double *); + extern double dlamch_(char *, ftnlen), dlange_(char *, blasint *, + blasint *, double *, blasint *, double *, ftnlen); static double scaloc; - extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); static double bignum; - static int notrna, notrnb; + static blasint notrna, notrnb; static double smlnum; /* Parameter adjustments */ diff --git a/relapack/src/dtrtri.c b/relapack/src/dtrtri.c index 0462609e9e..72777e7e49 100644 --- a/relapack/src/dtrtri.c +++ b/relapack/src/dtrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, - double *, const int *, int *); +static void RELAPACK_dtrtri_rec(const char *, const char *, const blasint *, + double *, const blasint *, blasint *); /** DTRTRI computes the inverse of a real upper or lower triangular matrix A. @@ -11,16 +11,16 @@ static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html * */ void RELAPACK_dtrtri( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_dtrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DTRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DTRTRI", &minfo, strlen("DTRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_dtrtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[i + *ldA * i] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_dtrtri( /** dtrtri's recursive compute kernel */ static void RELAPACK_dtrtri_rec( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_DTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_dtrtri_rec( const double MONE[] = { -1. 
}; // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/f2c.c b/relapack/src/f2c.c index 5a34524191..48539c4b9e 100644 --- a/relapack/src/f2c.c +++ b/relapack/src/f2c.c @@ -9,7 +9,7 @@ #endif #endif -void sig_die(const char *s, int kill) { +void sig_die(const char *s, blasint kill) { /* print error message, then clear buffers */ fprintf(stderr, "%s\n", s); diff --git a/relapack/src/f2c.h b/relapack/src/f2c.h index b94ee7c8e1..85337becfa 100644 --- a/relapack/src/f2c.h +++ b/relapack/src/f2c.h @@ -7,6 +7,19 @@ #ifndef F2C_INCLUDE #define F2C_INCLUDE +#ifdef USE64BITINT +typedef BLASLONG blasint; +#if defined(OS_WINDOWS) && defined(__64BIT__) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + + typedef long int integer; typedef unsigned long int uinteger; typedef char *address; diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h index 064276b7e0..776b0589fa 100644 --- a/relapack/src/lapack.h +++ b/relapack/src/lapack.h @@ -1,80 +1,80 @@ #ifndef LAPACK_H #define LAPACK_H -extern int LAPACK(lsame)(const char *, const char *); -extern int LAPACK(xerbla)(const char *, const int *); +extern blasint LAPACK(lsame)(const char *, const char *); +extern blasint LAPACK(xerbla)(const char *, const blasint *, int); -extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); -extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); -extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); -extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(slaswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); +extern void LAPACK(dlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); +extern void LAPACK(claswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); +extern void LAPACK(zlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); -extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); -extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); -extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); -extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); +extern void LAPACK(slaset)(const char *, const blasint *, const blasint *, const float *, const float *, float *, const blasint *); +extern void LAPACK(dlaset)(const char *, const blasint *, const blasint *, const double *, const double *, double *, const blasint *); +extern void LAPACK(claset)(const char *, const blasint *, const blasint *, const float *, const float *, float *, const blasint *); +extern void LAPACK(zlaset)(const char *, const blasint *, const blasint *, const double *, const double *, double *, const blasint *); -extern void LAPACK(slacpy)(const char *, 
const int *, const int *, const float *, const int *, float *, const int *); -extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); -extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); -extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); +extern void LAPACK(slacpy)(const char *, const blasint *, const blasint *, const float *, const blasint *, float *, const blasint *); +extern void LAPACK(dlacpy)(const char *, const blasint *, const blasint *, const double *, const blasint *, double *, const blasint *); +extern void LAPACK(clacpy)(const char *, const blasint *, const blasint *, const float *, const blasint *, float *, const blasint *); +extern void LAPACK(zlacpy)(const char *, const blasint *, const blasint *, const double *, const blasint *, double *, const blasint *); -extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); -extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); -extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); -extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(slascl)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dlascl)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(clascl)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(zlascl)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *); -extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *); -extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *); -extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(slauu2)(const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dlauu2)(const char *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(clauu2)(const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(zlauu2)(const char *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); -extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, 
int *); +extern void LAPACK(ssygs2)(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +extern void LAPACK(dsygs2)(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); +extern void LAPACK(chegs2)(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +extern void LAPACK(zhegs2)(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); -extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *); -extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *); -extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *); -extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *); +extern void LAPACK(strti2)(const char *, const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dtrti2)(const char *, const char *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(ctrti2)(const char *, const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(ztrti2)(const char *, const char *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *); -extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *); -extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *); -extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(spotf2)(const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dpotf2)(const char *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(cpotf2)(const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(zpotf2)(const char *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *); -extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *); -extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *); -extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(spbtf2)(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dpbtf2)(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(cpbtf2)(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(zpbtf2)(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *); -extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *); -extern void LAPACK(zhetf2)(const char *, const int *, double *, 
const int *, int *, int *); -extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); -extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); -extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(ssytf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dsytf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(csytf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(chetf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zsytf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(zhetf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(ssytf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dsytf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(csytf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(chetf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zsytf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(zhetf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *); -extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *); -extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *); -extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(sgetf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dgetf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(cgetf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zgetf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); -extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(sgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(cgbtf2)(const 
blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *); -extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *); -extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *); -extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *); +extern void LAPACK(stgsy2)(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, blasint *, blasint *, blasint *); +extern void LAPACK(dtgsy2)(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, blasint *, blasint *, blasint *); +extern void LAPACK(ctgsy2)(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, blasint *); +extern void LAPACK(ztgsy2)(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, blasint *); #endif /* LAPACK_H */ diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c index 4885472603..0252f3d92b 100644 --- a/relapack/src/lapack_wrappers.c +++ b/relapack/src/lapack_wrappers.c @@ -6,9 +6,9 @@ #if INCLUDE_SLAUUM void LAPACK(slauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_slauum(uplo, n, A, ldA, info); } @@ -16,9 +16,9 @@ void LAPACK(slauum)( #if INCLUDE_DLAUUM void LAPACK(dlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_dlauum(uplo, n, A, ldA, info); } @@ -26,9 +26,9 @@ void LAPACK(dlauum)( #if INCLUDE_CLAUUM void LAPACK(clauum)( - const char 
*uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_clauum(uplo, n, A, ldA, info); } @@ -36,9 +36,9 @@ void LAPACK(clauum)( #if INCLUDE_ZLAUUM void LAPACK(zlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_zlauum(uplo, n, A, ldA, info); } @@ -51,9 +51,9 @@ void LAPACK(zlauum)( #if INCLUDE_SSYGST void LAPACK(ssygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -61,9 +61,9 @@ void LAPACK(ssygst)( #if INCLUDE_DSYGST void LAPACK(dsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -71,9 +71,9 @@ void LAPACK(dsygst)( #if INCLUDE_CHEGST void LAPACK(chegst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -81,9 +81,9 @@ void LAPACK(chegst)( #if INCLUDE_ZHEGST void LAPACK(zhegst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -96,9 +96,9 @@ void LAPACK(zhegst)( #if INCLUDE_STRTRI void LAPACK(strtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_strtri(uplo, diag, n, A, ldA, info); } @@ -106,9 +106,9 @@ void LAPACK(strtri)( #if INCLUDE_DTRTRI void LAPACK(dtrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); } @@ -116,9 +116,9 @@ void LAPACK(dtrtri)( #if INCLUDE_CTRTRI void LAPACK(ctrtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); } @@ -126,9 +126,9 @@ void LAPACK(ctrtri)( #if INCLUDE_ZTRTRI void LAPACK(ztrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); } @@ -141,9 +141,9 @@ void LAPACK(ztrtri)( #if INCLUDE_SPOTRF void LAPACK(spotrf)( - const char *uplo, const int *n, - 
float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_spotrf(uplo, n, A, ldA, info); } @@ -151,9 +151,9 @@ void LAPACK(spotrf)( #if INCLUDE_DPOTRF void LAPACK(dpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_dpotrf(uplo, n, A, ldA, info); } @@ -161,9 +161,9 @@ void LAPACK(dpotrf)( #if INCLUDE_CPOTRF void LAPACK(cpotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_cpotrf(uplo, n, A, ldA, info); } @@ -171,9 +171,9 @@ void LAPACK(cpotrf)( #if INCLUDE_ZPOTRF void LAPACK(zpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_zpotrf(uplo, n, A, ldA, info); } @@ -186,9 +186,9 @@ void LAPACK(zpotrf)( #if INCLUDE_SPBTRF void LAPACK(spbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -196,9 +196,9 @@ void LAPACK(spbtrf)( #if INCLUDE_DPBTRF void LAPACK(dpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -206,9 +206,9 @@ void LAPACK(dpbtrf)( #if INCLUDE_CPBTRF void LAPACK(cpbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -216,9 +216,9 @@ void LAPACK(cpbtrf)( #if INCLUDE_ZPBTRF void LAPACK(zpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -231,9 +231,9 @@ void LAPACK(zpbtrf)( #if INCLUDE_SSYTRF void LAPACK(ssytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -241,9 +241,9 @@ void LAPACK(ssytrf)( #if INCLUDE_DSYTRF void LAPACK(dsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -251,9 +251,9 @@ void LAPACK(dsytrf)( #if INCLUDE_CSYTRF void LAPACK(csytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, 
info); } @@ -261,9 +261,9 @@ void LAPACK(csytrf)( #if INCLUDE_ZSYTRF void LAPACK(zsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -271,9 +271,9 @@ void LAPACK(zsytrf)( #if INCLUDE_CHETRF void LAPACK(chetrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -281,9 +281,9 @@ void LAPACK(chetrf)( #if INCLUDE_ZHETRF void LAPACK(zhetrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -291,9 +291,9 @@ void LAPACK(zhetrf)( #if INCLUDE_SSYTRF_ROOK void LAPACK(ssytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -301,9 +301,9 @@ void LAPACK(ssytrf_rook)( #if INCLUDE_DSYTRF_ROOK void LAPACK(dsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -311,9 +311,9 @@ void LAPACK(dsytrf_rook)( #if INCLUDE_CSYTRF_ROOK void LAPACK(csytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -321,9 +321,9 @@ void LAPACK(csytrf_rook)( #if INCLUDE_ZSYTRF_ROOK void LAPACK(zsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -331,9 +331,9 @@ void LAPACK(zsytrf_rook)( #if INCLUDE_CHETRF_ROOK void LAPACK(chetrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -341,9 +341,9 @@ void LAPACK(chetrf_rook)( #if INCLUDE_ZHETRF_ROOK void LAPACK(zhetrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint 
*n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -356,9 +356,9 @@ void LAPACK(zhetrf_rook)( #if INCLUDE_SGETRF void LAPACK(sgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); } @@ -366,9 +366,9 @@ void LAPACK(sgetrf)( #if INCLUDE_DGETRF void LAPACK(dgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); } @@ -376,9 +376,9 @@ void LAPACK(dgetrf)( #if INCLUDE_CGETRF void LAPACK(cgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); } @@ -386,9 +386,9 @@ void LAPACK(cgetrf)( #if INCLUDE_ZGETRF void LAPACK(zgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); } @@ -401,9 +401,9 @@ void LAPACK(zgetrf)( #if INCLUDE_SGBTRF void LAPACK(sgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -411,9 +411,9 @@ void LAPACK(sgbtrf)( #if INCLUDE_DGBTRF void LAPACK(dgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -421,9 +421,9 @@ void LAPACK(dgbtrf)( #if INCLUDE_CGBTRF void LAPACK(cgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -431,9 +431,9 @@ void LAPACK(cgbtrf)( #if INCLUDE_ZGBTRF void LAPACK(zgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -446,11 +446,11 @@ void LAPACK(zgbtrf)( #if INCLUDE_STRSYL void LAPACK(strsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { 
RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -458,11 +458,11 @@ void LAPACK(strsyl)( #if INCLUDE_DTRSYL void LAPACK(dtrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -470,11 +470,11 @@ void LAPACK(dtrsyl)( #if INCLUDE_CTRSYL void LAPACK(ctrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -482,11 +482,11 @@ void LAPACK(ctrsyl)( #if INCLUDE_ZTRSYL void LAPACK(ztrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -499,13 +499,13 @@ void LAPACK(ztrsyl)( #if INCLUDE_STGSYL void LAPACK(stgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -513,13 +513,13 @@ void LAPACK(stgsyl)( #if INCLUDE_DTGSYL void LAPACK(dtgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int 
*info + double *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -527,13 +527,13 @@ void LAPACK(dtgsyl)( #if INCLUDE_CTGSYL void LAPACK(ctgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -541,13 +541,13 @@ void LAPACK(ctgsyl)( #if INCLUDE_ZTGSYL void LAPACK(ztgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info + double *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -561,10 +561,10 @@ void LAPACK(ztgsyl)( #if INCLUDE_SGEMMT void LAPACK(sgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); } @@ -573,10 +573,10 @@ void LAPACK(sgemmt)( #if INCLUDE_DGEMMT void LAPACK(dgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); } @@ -585,10 +585,10 @@ void LAPACK(dgemmt)( #if INCLUDE_CGEMMT void LAPACK(cgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const
float *beta, float *C, const blasint *ldC ) { RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); } @@ -597,10 +597,10 @@ void LAPACK(cgemmt)( #if INCLUDE_ZGEMMT void LAPACK(zgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); } diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h index 2cb061c323..38c5c30d09 100644 --- a/relapack/src/relapack.h +++ b/relapack/src/relapack.h @@ -1,6 +1,14 @@ #ifndef RELAPACK_INT_H #define RELAPACK_INT_H - +#include <string.h> +#include "../../config.h" +#if defined(OS_WINDOWS) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif #include "../config.h" #include "../inc/relapack.h" @@ -38,23 +46,23 @@ #include "blas.h" // sytrf helper routines -void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf_rec2(const char *, const blasint *, const blasint
*, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_ssytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); // trsyl helper routines -void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); -void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_strsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_dtrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); +void RELAPACK_ctrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_ztrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); #endif /* RELAPACK_INT_H */ diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index bc20e744b2..3a4de4eced 100644 --- a/relapack/src/sgbtrf.c +++ b/relapack/src/sgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, - const int *, float *, const int *, int *, float *, const int *, float *, - const int *, int *); +static void RELAPACK_sgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, float *, const blasint *, blasint *, float *, const blasint *, float *, + const blasint *, blasint *); /** SGBTRF computes 
an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,11 +13,10 @@ static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html * */ void RELAPACK_sgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { - // Check arguments *info = 0; if (*m < 0) @@ -31,8 +30,8 @@ void RELAPACK_sgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SGBTRF", &minfo, strlen("SGBTRF")); return; } @@ -40,14 +39,14 @@ void RELAPACK_sgbtrf( const float ZERO[] = { 0. }; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { float *const A_j = A + *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +54,11 @@ void RELAPACK_sgbtrf( } // Allocate work space - const int n1 = SREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = SREC_SPLIT(*n); + const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const blasint nWorkl = (kv > n1) ? n1 : kv; + const blasint mWorku = (*kl > n1) ? n1 : *kl; + const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); float *Worku = malloc(mWorku * nWorku * sizeof(float)); LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +75,10 @@ void RELAPACK_sgbtrf( /** sgbtrf's recursive compute kernel */ static void RELAPACK_sgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + float *Workl, const blasint *ldWorkl, float *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { @@ -91,25 +90,25 @@ static void RELAPACK_sgbtrf_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1.
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + kv; // Splitting - const int n1 = MIN(SREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(SREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +128,14 @@ static void RELAPACK_sgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +163,7 @@ static void RELAPACK_sgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -180,7 +179,7 @@ static void RELAPACK_sgbtrf_rec( for (j = 0; j < n22; j++) { float *const A_Rrj = A_Rr + *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const float tmp = A_Rrj[i]; A_Rrj[i] = A_Rr[ip]; @@ -208,7 +207,7 @@ static void RELAPACK_sgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); diff --git a/relapack/src/sgemmt.c b/relapack/src/sgemmt.c index 75f78fabd1..93438858c6 100644 --- a/relapack/src/sgemmt.c +++ b/relapack/src/sgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_sgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); /** SGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_sgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + 
const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,13 +32,13 @@ void RELAPACK_sgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !notransA) @@ -56,7 +56,7 @@ void RELAPACK_sgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("SGEMMT", &info); + LAPACK(xerbla)("SGEMMT", &info, strlen("SGEMMT")); return; } @@ -74,10 +74,10 @@ void RELAPACK_sgemmt( /** sgemmt's recursive compute kernel */ static void RELAPACK_sgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_SGEMMT, 1)) { @@ -87,8 +87,8 @@ static void RELAPACK_sgemmt_rec( } // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -124,16 +124,16 @@ static void RELAPACK_sgemmt_rec( /** sgemmt's unblocked compute kernel */ static void RELAPACK_sgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i const float *const A_0 = A; const float *const A_i = (*transA == 'N') ? A + i : A + *ldA * i; // B_i const float *const B_i = (*transB == 'N') ? B + *ldB * i : B + i; // C_0i // C_ii float *const C_0i = C + *ldC * i; float *const C_ii = C + *ldC * i + i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 284f8cff67..9d0ff10399 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *, - int *, int *); +static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *, + blasint *, blasint *); /** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *, * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html * */ void RELAPACK_sgetrf( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -25,12 +25,12 @@ void RELAPACK_sgetrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); return; } - const int sn = MIN(*m, *n); + const blasint sn = MIN(*m, *n); RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_sgetrf( if (*m < *n) { // Constants const float ONE[] = { 1. }; - const int iONE[] = { 1. }; + const blasint iONE[] = { 1 }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const float *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_sgetrf( /** sgetrf's recursive compute kernel */ static void RELAPACK_sgetrf_rec( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { if (*n <= MAX(CROSSOVER_SGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_sgetrf_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1.
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R float *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_sgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_sgetrf_rec( // apply pivots to A_BL LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/slauum.c b/relapack/src/slauum.c index 280f141b31..79212817f8 100644 --- a/relapack/src/slauum.c +++ b/relapack/src/slauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_slauum_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_slauum_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_slauum_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html * */ void RELAPACK_slauum( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_slauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SLAUUM", &minfo, strlen("SLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_slauum( /** slauum's recursive compute kernel */ static void RELAPACK_slauum_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_SLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_slauum_rec( const float ONE[] = { 1. }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c index ee0a5546e9..26804dcc2f 100644 --- a/relapack/src/spbtrf.c +++ b/relapack/src/spbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, - float *, const int *, float *, const int *, int *); +static void RELAPACK_spbtrf_rec(const char *, const blasint *, const blasint *, + float *, const blasint *, float *, const blasint *, blasint *); /** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. 
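The banded routines here (sgbtrf above, spbtrf below) all lean on the same "unskew" trick: LAPACK band storage keeps column j of the band at Ab + ldAb*j, so re-addressing the array with the reduced leading dimension ldA = ldAb - 1 lets ordinary BLAS kernels walk the band as if it were a dense matrix. A minimal standalone sketch of that index identity for the lower-triangular case (illustrative values only, not taken from the patch):

    #include <stdio.h>

    int main(void) {
        /* 4x4 lower-banded matrix, bandwidth kd = 1, in LAPACK band storage */
        enum { n = 4, kd = 1, ldAb = kd + 1 };
        float Ab[ldAb * n];
        int i, j;
        for (j = 0; j < n; j++)
            for (i = j; i < n && i - j <= kd; i++)
                Ab[ldAb * j + (i - j)] = 10.0f * i + j;   /* store A(i,j) */
        /* "Unskew": with ldA = ldAb - 1 the same element is A[ldA*j + i],
         * because ldAb*j + (i - j) == (ldAb - 1)*j + i.                   */
        const int ldA = ldAb - 1;
        const float *A = Ab;            /* lower case: no row offset needed */
        for (j = 0; j < n; j++)
            for (i = j; i < n && i - j <= kd; i++)
                printf("A(%d,%d) = %2.0f\n", i, j, A[ldA * j + i]);
        return 0;
    }

For the upper-triangular layout the dense view starts kd rows down, which is why spbtrf_rec offsets A by *kd when uplo is 'U'.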
@@ -12,14 +12,14 @@ static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html * */ void RELAPACK_spbtrf( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_spbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SPBTRF", &minfo, strlen("SPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_spbtrf( const float ZERO[] = { 0. }; // Allocate work space - const int n1 = SREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = SREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; float *Work = malloc(mWork * nWork * sizeof(float)); LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_spbtrf( /** spbtrf's recursive compute kernel */ static void RELAPACK_spbtrf_rec( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - float *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + float *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_spbtrf_rec( const float MONE[] = { -1. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(SREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(SREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_spbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, *kd); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/spotrf.c b/relapack/src/spotrf.c index 2a609321be..b22e917f75 100644 --- a/relapack/src/spotrf.c +++ b/relapack/src/spotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_spotrf_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_spotrf_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. 
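spotrf, like every routine in this file set, follows one recursion shape: factor the top-left n1 x n1 block, update the trailing matrix with level-3 BLAS, recurse on what is left, and fall back to the unblocked LAPACK kernel below a crossover size. A self-contained sketch of that shape for lower Cholesky, with the strsm/ssyrk calls replaced by plain loops and all names invented (the real code splits with SREC_SPLIT and defers to the unblocked LAPACK routine at the crossover):

    #include <math.h>
    #include <stdio.h>
    #include <stddef.h>

    /* Recursive lower Cholesky, same shape as RELAPACK_spotrf_rec. */
    static int spotrf_rec_sketch(int n, float *A, int ldA) {
        if (n == 1) {                       /* stand-in for the unblocked kernel */
            if (A[0] <= 0.0f) return 1;     /* info: non-positive pivot */
            A[0] = sqrtf(A[0]);
            return 0;
        }
        const int n1 = n / 2, n2 = n - n1;  /* the real code uses SREC_SPLIT(n) */
        float *A_TL = A, *A_BL = A + n1, *A_BR = A + (size_t)ldA * n1 + n1;

        int info = spotrf_rec_sketch(n1, A_TL, ldA);   /* recursion(A_TL) */
        if (info) return info;

        /* A_BL := A_BL * inv(A_TL)^T, what BLAS(strsm)("R","L","T","N") does */
        for (int i = 0; i < n2; i++)
            for (int j = 0; j < n1; j++) {
                float s = A_BL[(size_t)ldA * j + i];
                for (int k = 0; k < j; k++)
                    s -= A_BL[(size_t)ldA * k + i] * A_TL[(size_t)ldA * k + j];
                A_BL[(size_t)ldA * j + i] = s / A_TL[(size_t)ldA * j + j];
            }

        /* A_BR := A_BR - A_BL*A_BL^T (lower part), what BLAS(ssyrk)("L","N") does */
        for (int j = 0; j < n2; j++)
            for (int i = j; i < n2; i++)
                for (int k = 0; k < n1; k++)
                    A_BR[(size_t)ldA * j + i] -=
                        A_BL[(size_t)ldA * k + i] * A_BL[(size_t)ldA * k + j];

        info = spotrf_rec_sketch(n2, A_BR, ldA);       /* recursion(A_BR) */
        return info ? info + n1 : 0;        /* shift pivot index past A_TL */
    }

    int main(void) {
        /* 3x3 SPD matrix, column-major; its Cholesky factor has diagonal 2, 1, 3 */
        float A[9] = { 4, 2, 6,  2, 2, 5,  6, 5, 22 };
        int info = spotrf_rec_sketch(3, A, 3);
        printf("info = %d, diag(L) = %g %g %g\n", info,
               (double)A[0], (double)A[4], (double)A[8]);
        return 0;
    }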
@@ -11,14 +11,14 @@ static void RELAPACK_spotrf_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html * */ void RELAPACK_spotrf( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_spotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SPOTRF", &minfo, strlen("SPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_spotrf( /** spotrf's recursive compute kernel */ static void RELAPACK_spotrf_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_SPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_spotrf_rec( const float MONE[] = { -1. }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c index 7f145cdec9..4259f90319 100644 --- a/relapack/src/ssygst.c +++ b/relapack/src/ssygst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_ssygst_rec(const int *, const char *, const int *, - float *, const int *, const float *, const int *, - float *, const int *, int *); +static void RELAPACK_ssygst_rec(const blasint *, const char *, const blasint *, + float *, const blasint *, const float *, const blasint *, + float *, const blasint *, blasint *); /** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. 
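The xerbla change repeated through these files is a calling-convention fix, not a behavior change: Fortran CHARACTER dummy arguments carry a hidden length argument appended after the declared parameter list, so calling the Fortran XERBLA from C without it leaves the callee reading an indeterminate length (this is also why relapack.h now pulls in string.h for strlen). A runnable stand-in showing the call shape; xerbla_demo is an invented name, and the hidden length is size_t under the usual gfortran convention (older compilers used int):

    #include <stdio.h>
    #include <string.h>

    /* Stand-in for Fortran XERBLA(SRNAME, INFO): the trailing parameter is the
     * hidden length of the CHARACTER argument that Fortran passes implicitly. */
    static void xerbla_demo(const char *srname, const int *info, size_t len) {
        printf("** On entry to %.*s parameter number %d had an illegal value\n",
               (int)len, srname, *info);
    }

    int main(void) {
        const int minfo = 4;     /* e.g. *info = -4 because ldA < MAX(1, n) */
        xerbla_demo("SPOTRF", &minfo, strlen("SPOTRF"));
        return 0;
    }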
@@ -15,14 +15,14 @@ static void RELAPACK_ssygst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html * */ void RELAPACK_ssygst( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_ssygst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SSYGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SSYGST", &minfo, strlen("SSYGST")); return; } @@ -45,9 +45,9 @@ void RELAPACK_ssygst( // Allocate work space float *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = SREC_SPLIT(*n); + const blasint n1 = SREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * sizeof(float)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_ssygst( /** ssygst's recursive compute kernel */ static void RELAPACK_ssygst_rec( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - float *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_SSYGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_ssygst_rec( const float MONE[] = { -1. }; const float HALF[] = { .5 }; const float MHALF[] = { -.5 }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c index 8a4fad9f2a..9fe7ce4a6e 100644 --- a/relapack/src/ssytrf.c +++ b/relapack/src/ssytrf.c @@ -2,9 +2,8 @@ #if XSYTRF_ALLOW_MALLOC #include <stdlib.h> #endif - -static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_ssytrf_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** SSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
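The blanket int -> blasint conversion exists because OpenBLAS can be built with a 64-bit integer interface (INTERFACE64=1), where every dimension, leading dimension, increment, pivot, and info value crossing the BLAS/LAPACK boundary is 8 bytes wide; a prototype written with int then hands the callee a pointer to the wrong width. A small demonstration of that failure mode, with a blasint typedef that only mirrors the spirit of the OpenBLAS definition:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #ifdef INTERFACE64
    typedef int64_t blasint;       /* ILP64 interface: 64-bit dimensions */
    #else
    typedef int32_t blasint;       /* LP64 interface: 32-bit dimensions */
    #endif

    int main(void) {
        int64_t n64 = (1LL << 32) + 7;  /* a dimension that needs 64-bit ints */
        int32_t half;
        memcpy(&half, &n64, sizeof half);  /* the 4 bytes an int* callee reads */
        /* Prints 7 on little-endian, 1 on big-endian (s390x): wrong either way. */
        printf("caller passed %lld; a 32-bit callee sees %d\n",
               (long long)n64, (int)half);
        printf("sizeof(blasint) = %zu\n", sizeof(blasint));
        return 0;
    }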
@@ -14,21 +13,21 @@ static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html * */ void RELAPACK_ssytrf( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +54,8 @@ void RELAPACK_ssytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); return; } @@ -64,7 +63,7 @@ void RELAPACK_ssytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +77,13 @@ void RELAPACK_ssytrf( /** ssytrf's recursive compute kernel */ static void RELAPACK_ssytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_SSYTRF, 3)) { // Unblocked @@ -96,34 +95,34 @@ static void RELAPACK_ssytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = SREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = SREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +138,23 @@ static void RELAPACK_ssytrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + n1; float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -182,22 +181,22 @@ static void RELAPACK_ssytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = SREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = SREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +212,19 @@ static void RELAPACK_ssytrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/ssytrf_rec2.c b/relapack/src/ssytrf_rec2.c index edc9269eca..13856f0646 100644 --- a/relapack/src/ssytrf_rec2.c +++ b/relapack/src/ssytrf_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static float c_b8 = -1.f; static float c_b9 = 1.f; @@ -25,32 +25,32 @@ static float c_b9 = 1.f; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int * - nb, int *kb, float *a, int *lda, int *ipiv, float *w, - int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, float *a, blasint *lda, blasint *ipiv, float *w, + int *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; float r__1, r__2, r__3; /* Builtin functions */ double sqrt(double); /* Local variables */ - static int j, k; + static blasint j, k; static float t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int sscal_(int *, float *, float *, int *), - sgemv_(char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *, ftnlen); - static int kstep; - extern /* Subroutine */ int scopy_(int *, float *, int *, float *, - int *), sswap_(int *, float *, int *, float *, int * + extern /* Subroutine */ blasint sscal_(int *, float *, float *, blasint *), + sgemv_(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *, ftnlen); + static blasint kstep; + extern /* Subroutine */ blasint scopy_(int *, float *, blasint *, float *, + blasint *), sswap_(int *, float *, blasint *, float *, blasint * ); static float absakk; - extern int isamax_(int *, float *, int *); + extern blasint isamax_(int *, float *, blasint *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c index 040df24840..abcf29d1cb 100644 --- a/relapack/src/ssytrf_rook.c +++ b/relapack/src/ssytrf_rook.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_ssytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, - float *, const int *, int *, float *, const int *, int *); + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
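The _rook variants differ from plain Bunch-Kaufman only in the pivot search; both use the classical threshold that appears as the local variable alpha in the _rec2 kernels in these hunks:

\[ \alpha = \frac{1 + \sqrt{17}}{8} \approx 0.6404 \]

A 1-by-1 pivot is accepted when \( |a_{kk}| \ge \alpha \cdot \mathrm{colmax} \), where colmax is the largest off-diagonal magnitude in column k; otherwise a 2-by-2 pivot is formed, the rook search iterating alternating row and column scans (the rowmax/colmax/imax/jmax locals) until the bound holds.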
@@ -14,21 +14,21 @@ static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html * */ void RELAPACK_ssytrf_rook( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_ssytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_ssytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_ssytrf_rook( /** ssytrf_rook's recursive compute kernel */ static void RELAPACK_ssytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_ssytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = SREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = SREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_ssytrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + n1; float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_ssytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_ssytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = SREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = SREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_ssytrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/ssytrf_rook_rec2.c b/relapack/src/ssytrf_rook_rec2.c index 3308826d7e..41659cb3e5 100644 --- a/relapack/src/ssytrf_rook_rec2.c +++ b/relapack/src/ssytrf_rook_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static float c_b9 = -1.f; static float c_b10 = 1.f; @@ -25,39 +25,39 @@ static float c_b10 = 1.f; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, float *a, int *lda, int *ipiv, float * - w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, float *a, blasint *lda, blasint *ipiv, float * + w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; float r__1; /* Builtin functions */ double sqrt(double); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static float t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int sscal_(int *, float *, float *, int *); + extern /* Subroutine */ blasint sscal_(int *, float *, float *, blasint *); static float sfmin; - static int itemp; - extern /* Subroutine */ int sgemv_(char *, int *, int *, float *, - float *, int *, float *, int *, float *, float *, int *, + static blasint itemp; + extern /* Subroutine */ blasint sgemv_(char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *, ftnlen); - static int kstep; + static blasint kstep; static float stemp; - extern /* Subroutine */ int scopy_(int *, float *, int *, float *, - int *), sswap_(int *, float *, int *, float *, int * + extern /* Subroutine */ blasint scopy_(int *, float *, blasint *, float *, + blasint *), sswap_(int *, float *, blasint *, float *, blasint * ); static float absakk; extern double slamch_(char *, ftnlen); - extern int isamax_(int *, float *, int *); + extern blasint isamax_(int *, float *, blasint *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/stgsyl.c b/relapack/src/stgsyl.c index 1870fb9289..6bace9f173 100644 --- a/relapack/src/stgsyl.c +++ b/relapack/src/stgsyl.c @@ -1,11 +1,11 @@ #include "relapack.h" #include <math.h> -static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, - const int *, const float *, const int *, const float *, const int *, - float *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, float *, float *, int *, int *, - int *); +static void RELAPACK_stgsyl_rec(const char *, const blasint *, const blasint *, + const blasint *, const float *, const blasint *, const float *, const blasint *, + float *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, float *, float *, blasint *, blasint *, + blasint *); /** STGSYL solves the generalized Sylvester equation.
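Concretely, with transposition mode 'N' the coupled system solved for the unknown pair (R, L) is, per the netlib stgsyl documentation linked in the next hunk:

\[ A\,R - L\,B = \mathrm{scale} \cdot C, \qquad D\,R - L\,E = \mathrm{scale} \cdot F \]

where A and D are m-by-m, B and E are n-by-n, C and F are m-by-n, and 0 < scale <= 1 is chosen by the solver to avoid overflow.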
@@ -15,21 +15,21 @@ static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html * */ void RELAPACK_stgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "T"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "T"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -58,8 +58,8 @@ void RELAPACK_stgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("STGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("STGSYL", &minfo, strlen("STGSYL")); return; } @@ -75,8 +75,8 @@ void RELAPACK_stgsyl( // Constant const float ZERO[] = { 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -87,12 +87,12 @@ void RELAPACK_stgsyl( } float scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; float dscale = 0; float dsum = 1; - int pq; + blasint pq; RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); if (dscale != 0) { if (*ijob == 1 || *ijob == 3) @@ -121,13 +121,13 @@ void RELAPACK_stgsyl( /** stgsyl's recursive vompute kernel */ static void RELAPACK_stgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dsum, float *dscale, - int *iWork, int *pq, int *info + blasint *iWork, blasint *pq, blasint *info ) { if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) { @@ -139,20 +139,20 @@ static void RELAPACK_stgsyl_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1. }; float scale2[] = { 1. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - int m1 = SREC_SPLIT(*m); + blasint m1 = SREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const int m2 = *m - m1; + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -210,10 +210,10 @@ static void RELAPACK_stgsyl_rec( } } else { // Splitting - int n1 = SREC_SPLIT(*n); + blasint n1 = SREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const int n2 = *n - n1; + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c index 83947ef1a0..012fb35486 100644 --- a/relapack/src/strsyl.c +++ b/relapack/src/strsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_strsyl_rec(const char *, const char *, const int *, - const int *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, int *); +static void RELAPACK_strsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, blasint *); /** STRSYL solves the real Sylvester matrix equation. @@ -12,20 +12,20 @@ static void RELAPACK_strsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html * */ void RELAPACK_strsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int transA = LAPACK(lsame)(tranA, "T"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int transB = LAPACK(lsame)(tranB, "T"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint transA = LAPACK(lsame)(tranA, "T"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint transB = LAPACK(lsame)(tranB, "T"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!transA && !ctransA && !notransA) *info = -1; @@ -44,8 +44,8 @@ void RELAPACK_strsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("STRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("STRSYL", &minfo, strlen("STRSYL")); return; } @@ -60,11 +60,11 @@ void RELAPACK_strsyl( /** strsyl's recursive compute kernel */ static void RELAPACK_strsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) { @@ -77,20 +77,20 @@ static void RELAPACK_strsyl_rec( const float ONE[] = { 1. }; const float MONE[] = { -1. 
}; const float MSGN[] = { -*isgn }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1. }; float scale2[] = { 1. }; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - int m1 = SREC_SPLIT(*m); + blasint m1 = SREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const int m2 = *m - m1; + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -126,10 +126,10 @@ static void RELAPACK_strsyl_rec( } } else { // Splitting - int n1 = SREC_SPLIT(*n); + blasint n1 = SREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const int n2 = *n - n1; + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/strsyl_rec2.c b/relapack/src/strsyl_rec2.c index 6d40a475d7..37a24c7dc2 100644 --- a/relapack/src/strsyl_rec2.c +++ b/relapack/src/strsyl_rec2.c @@ -14,48 +14,48 @@ /* Table of constant values */ -static int c__1 = 1; -static int c_false = FALSE_; -static int c__2 = 2; +static blasint c__1 = 1; +static blasint c_false = FALSE_; +static blasint c__2 = 2; static float c_b26 = 1.f; static float c_b30 = 0.f; -static int c_true = TRUE_; +static blasint c_true = TRUE_; -void RELAPACK_strsyl_rec2(char *trana, char *tranb, int *isgn, int - *m, int *n, float *a, int *lda, float *b, int *ldb, float * - c__, int *ldc, float *scale, int *info, ftnlen trana_len, +void RELAPACK_strsyl_rec2(char *trana, char *tranb, blasint *isgn, int + *m, blasint *n, float *a, blasint *lda, float *b, blasint *ldb, float * + c__, blasint *ldc, float *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; float r__1, r__2; /* Local variables */ - static int j, k, l; + static blasint j, k, l; static float x[4] /* was [2][2] */; - static int k1, k2, l1, l2; + static blasint k1, k2, l1, l2; static float a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn; - static int ierr; + static blasint ierr; static float smin; - extern float sdot_(int *, float *, int *, float *, int *); + extern float sdot_(int *, float *, blasint *, float *, blasint *); static float suml, sumr; - extern int lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int sscal_(int *, float *, float *, int *); - static int knext, lnext; + extern blasint lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ blasint sscal_(int *, float *, float *, blasint *); + static blasint knext, lnext; static float xnorm; - extern /* Subroutine */ int slaln2_(int *, int *, int *, float - *, float *, float *, int *, float *, float *, float *, int *, - float *, float *, float *, int *, float *, float *, int *), - slasy2_(int *, int *, int *, int *, int *, - float *, int *, float *, int *, float *, int *, float *, - float *, int *, float *, int *), slabad_(float *, float *); + extern /* Subroutine */ blasint slaln2_(int *, blasint *, blasint *, float + *, float *, float *, blasint *, float *, float *, float *, blasint *, + float *, float *, float *, blasint *, float *, float *, blasint *), + slasy2_(int *, blasint *, blasint *, blasint *, blasint *, + float *, blasint *, float *, blasint *, float *, blasint *, float *, + float *, blasint *, float *, blasint *), slabad_(float *, float *); static float scaloc; - extern float slamch_(char *, ftnlen), slange_(char *, int *, - int *, float *, int *, float *, ftnlen); - extern /* Subroutine 
*/ int xerbla_(char *, int *, ftnlen); + extern float slamch_(char *, ftnlen), slange_(char *, blasint *, + blasint *, float *, blasint *, float *, ftnlen); + extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); static float bignum; - static int notrna, notrnb; + static blasint notrna, notrnb; static float smlnum; /* Parameter adjustments */ diff --git a/relapack/src/strtri.c b/relapack/src/strtri.c index d35bbd49f4..18d11f5ebc 100644 --- a/relapack/src/strtri.c +++ b/relapack/src/strtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_strtri_rec(const char *, const char *, const int *, - float *, const int *, int *); +static void RELAPACK_strtri_rec(const char *, const char *, const blasint *, + float *, const blasint *, blasint *); /** CTRTRI computes the inverse of a real upper or lower triangular matrix A. @@ -11,16 +11,16 @@ static void RELAPACK_strtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html * */ void RELAPACK_strtri( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_strtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("STRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("STRTRI", &minfo, strlen("STRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_strtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[i + *ldA * i] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_strtri( /** strtri's recursive compute kernel */ static void RELAPACK_strtri_rec( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_STRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_strtri_rec( const float MONE[] = { -1. }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c index 3aa6bf5318..0dd3fa7c33 100644 --- a/relapack/src/zgbtrf.c +++ b/relapack/src/zgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, - const int *, double *, const int *, int *, double *, const int *, double *, - const int *, int *); +static void RELAPACK_zgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, + const blasint *, blasint *); /** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. 
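The *ldAb < 2 * *kl + *ku + 1 argument check and the A = Ab + 2 * kv unskewing below follow LAPACK's band storage scheme: element (i,j) of a matrix with kl subdiagonals and ku superdiagonals is kept at row kl+ku+i-j (0-based) of the column-major array Ab, with kl extra rows reserved for pivoting fill-in; the factor of 2 merely accounts for interleaved complex data. A small standalone sketch of the index map (names are illustrative, not OpenBLAS API):

#include <stdio.h>

/* 0-based LAPACK-style band storage index: element (i,j) of a matrix
 * with kl subdiagonals and ku superdiagonals lives at row kl+ku+i-j of
 * a column-major array with leading dimension ldab >= 2*kl+ku+1; the
 * first kl rows are workspace for pivoting fill-in. */
static int band_index(int i, int j, int kl, int ku, int ldab) {
    return (kl + ku + i - j) + j * ldab;
}

int main(void) {
    int kl = 2, ku = 1, ldab = 2 * kl + ku + 1;   /* minimal ldab = 6 */
    /* the main diagonal (i == j) always maps to row kl+ku of Ab */
    printf("A(3,3) -> Ab row %d, flat offset %d\n",
           kl + ku, band_index(3, 3, kl, ku, ldab));
    /* for complex data stored as interleaved doubles, as in zgbtrf.c,
     * the flat offset is simply doubled */
    printf("real part of A(4,3) at 2*%d\n", band_index(4, 3, kl, ku, ldab));
    return 0;
}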
@@ -13,9 +13,9 @@ static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html * */ void RELAPACK_zgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_zgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZGBTRF", &minfo, strlen("ZGBTRF")); return; } @@ -40,14 +40,14 @@ void RELAPACK_zgbtrf( const double ZERO[] = { 0., 0. }; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { double *const A_j = A + 2 * *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +55,11 @@ void RELAPACK_zgbtrf( } // Allocate work space - const int n1 = ZREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = ZREC_SPLIT(*n); + const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const blasint nWorkl = (kv > n1) ? n1 : kv; + const blasint mWorku = (*kl > n1) ? n1 : *kl; + const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_zgbtrf( /** zgbtrf's recursive compute kernel */ static void RELAPACK_zgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + double *Workl, const blasint *ldWorkl, double *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_zgbtrf_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * kv; // Splitting - const int n1 = MIN(ZREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(ZREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +129,14 @@ static void RELAPACK_zgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +164,7 @@ static void RELAPACK_zgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -180,7 +180,7 @@ static void RELAPACK_zgbtrf_rec( for (j = 0; j < n22; j++) { double *const A_Rrj = A_Rr + 2 * *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const double tmpr = A_Rrj[2 * i]; const double tmpc = A_Rrj[2 * i + 1]; @@ -211,7 +211,7 @@ static void RELAPACK_zgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); diff --git a/relapack/src/zgemmt.c b/relapack/src/zgemmt.c index aa59302386..f53a3ca6f7 100644 --- a/relapack/src/zgemmt.c +++ b/relapack/src/zgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_zgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); /** ZGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_zgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const 
blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,15 +32,15 @@ void RELAPACK_zgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int ctransA = LAPACK(lsame)(transA, "C"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - const int ctransB = LAPACK(lsame)(transB, "C"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint ctransA = LAPACK(lsame)(transA, "C"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + const blasint ctransB = LAPACK(lsame)(transB, "C"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !ctransA && !notransA) @@ -58,7 +58,7 @@ void RELAPACK_zgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("ZGEMMT", &info); + LAPACK(xerbla)("ZGEMMT", &info, strlen("ZGEMMT")); return; } @@ -76,10 +76,10 @@ void RELAPACK_zgemmt( /** zgemmt's recursive compute kernel */ static void RELAPACK_zgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) { @@ -89,8 +89,8 @@ static void RELAPACK_zgemmt_rec( } // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -126,16 +126,16 @@ static void RELAPACK_zgemmt_rec( /** zgemmt's unblocked compute kernel */ static void RELAPACK_zgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -151,13 +151,13 @@ static void RELAPACK_zgemmt_rec2( double *const C_ii = C + 2 * *ldC * i + 2 * i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index cf8921e1f0..121b034018 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zgetrf_rec(const int *, const int *, double *, - const int *, int *, int *); +static void RELAPACK_zgetrf_rec(const blasint *, const blasint *, double *, + const blasint *, blasint *, blasint *); /** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_zgetrf_rec(const int *, const int *, double *, * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html * */ void RELAPACK_zgetrf( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -25,12 +25,12 @@ void RELAPACK_zgetrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZGETRF", &minfo, strlen("ZGETRF")); return; } - const int sn = MIN(*m, *n); + const blasint sn = MIN(*m, *n); RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_zgetrf( if (*m < *n) { // Constants const double ONE[] = { 1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const double *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_zgetrf( /** zgetrf's recursive compute kernel */ static void RELAPACK_zgetrf_rec( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_zgetrf_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1. }; + const blasint iONE[] = { 1. 
}; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R double *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_zgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_zgetrf_rec( // apply pivots to A_BL LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/zhegst.c b/relapack/src/zhegst.c index d0ece21481..dc9b7eacee 100644 --- a/relapack/src/zhegst.c +++ b/relapack/src/zhegst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_zhegst_rec(const int *, const char *, const int *, - double *, const int *, const double *, const int *, - double *, const int *, int *); +static void RELAPACK_zhegst_rec(const blasint *, const char *, const blasint *, + double *, const blasint *, const double *, const blasint *, + double *, const blasint *, blasint *); /** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_zhegst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html * */ void RELAPACK_zhegst( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_zhegst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZHEGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZHEGST", &minfo, strlen("ZHEGST")); return; } @@ -45,9 +45,9 @@ void RELAPACK_zhegst( // Allocate work space double *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = ZREC_SPLIT(*n); + const blasint n1 = ZREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * 2 * sizeof(double)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_zhegst( /** zhegst's recursive compute kernel */ static void RELAPACK_zhegst_rec( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - double *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_ZHEGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_zhegst_rec( const double MONE[] = { -1., 0. }; const double HALF[] = { .5, 0. }; const double MHALF[] = { -.5, 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zhetrf.c b/relapack/src/zhetrf.c index ef4e1f5d5d..3d458fecf8 100644 --- a/relapack/src/zhetrf.c +++ b/relapack/src/zhetrf.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zhetrf_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html * */ void RELAPACK_zhetrf( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zhetrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zhetrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zhetrf( /** zhetrf's recursive compute kernel */ static void RELAPACK_zhetrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zhetrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0.
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zhetrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zhetrf_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zhetrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zhetrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zhetrf_rec2.c b/relapack/src/zhetrf_rec2.c index 867ea64e15..c14cf04406 100644 --- a/relapack/src/zhetrf_rec2.c +++ b/relapack/src/zhetrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int * - nb, int *kb, doublecomplex *a, int *lda, int *ipiv, - doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, doublecomplex *a, blasint *lda, blasint *ipiv, + doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2, d__3, d__4; doublecomplex z__1, z__2, z__3, z__4; @@ -39,26 +39,26 @@ static int c__1 = 1; doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k; + static blasint j, k; static double t, r1; static doublecomplex d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - static int kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + static blasint kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(int *, doublecomplex *, blasint *, + doublecomplex *, blasint *); static double absakk; - extern /* Subroutine */ int zdscal_(int *, double *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zdscal_(int *, double *, + doublecomplex *, blasint *); static double colmax; - extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) + extern /* Subroutine */ blasint zlacgv_(int *, doublecomplex *, blasint *) ; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(int *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c index 15ceaeae7a..285aea96e8 100644
--- a/relapack/src/zhetrf_rook.c +++ b/relapack/src/zhetrf_rook.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zhetrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html * */ void RELAPACK_zhetrf_rook( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zhetrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zhetrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zhetrf_rook( /** zhetrf_rook's recursive compute kernel */ static void RELAPACK_zhetrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zhetrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0.
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zhetrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zhetrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zhetrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zhetrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zhetrf_rook_rec2.c b/relapack/src/zhetrf_rook_rec2.c index a56ad710b7..e5033ad491 100644 --- a/relapack/src/zhetrf_rook_rec2.c +++ b/relapack/src/zhetrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method * * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, doublecomplex *a, int *lda, int * - ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, doublecomplex *a, blasint *lda, blasint * + ipiv, doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4, z__5; @@ -39,30 +39,30 @@ static int c__1 = 1; doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static double t, r1; static doublecomplex d11, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); static double dtemp, sfmin; - static int itemp, kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + static blasint itemp, kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(int *, doublecomplex *, blasint *, + doublecomplex *, blasint *); extern double dlamch_(char *, ftnlen); static double absakk; - extern /* Subroutine */ int zdscal_(int *, double *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zdscal_(int *, double *, + doublecomplex *, blasint *); static double colmax; - extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) + extern /* Subroutine */ blasint zlacgv_(int *, doublecomplex *, blasint *) ; - extern int izamax_(int *, doublecomplex *, int *); + extern 
blasint izamax_(int *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zlauum.c b/relapack/src/zlauum.c index 490dcc82e9..14fcd92138 100644 --- a/relapack/src/zlauum.c +++ b/relapack/src/zlauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zlauum_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_zlauum_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_zlauum_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html * */ void RELAPACK_zlauum( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_zlauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZLAUUM", &minfo, strlen("ZLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_zlauum( /** zlauum's recursive compute kernel */ static void RELAPACK_zlauum_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_zlauum_rec( const double ONE[] = { 1., 0. }; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c index 37e711c9dd..fb0e1e97b5 100644 --- a/relapack/src/zpbtrf.c +++ b/relapack/src/zpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, - double *, const int *, double *, const int *, int *); +static void RELAPACK_zpbtrf_rec(const char *, const blasint *, const blasint *, + double *, const blasint *, double *, const blasint *, blasint *); /** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. 
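Aside on the xerbla changes in these files: xerbla is implemented in Fortran, so a C caller must also pass the hidden string-length argument that Fortran compilers append for every CHARACTER parameter, which is what the added strlen("...") arguments supply. A minimal sketch of the convention, not part of the patch; the hand-written declaration and the size_t length type are assumptions (real builds use the ftnlen typedef and blasint):

#include <string.h>

/* Hand-declared f2c-style binding, for illustration only. */
extern void xerbla_(const char *srname, int *info, size_t srname_len);

void report_bad_arg(int position)
{
    int minfo = position;
    /* strlen("ZLAUUM") is 6; sizeof "ZLAUUM" would be 7 because it counts
       the terminating NUL, which the Fortran side must not see (the same
       off-by-one fixed for ERROR_NAME in interface/trsm.c later in this
       series). */
    xerbla_("ZLAUUM", &minfo, strlen("ZLAUUM"));
}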
@@ -12,14 +12,14 @@ static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html * */ void RELAPACK_zpbtrf( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_zpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZPBTRF", &minfo, strlen("ZPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_zpbtrf( const double ZERO[] = { 0., 0. }; // Allocate work space - const int n1 = ZREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = ZREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; double *Work = malloc(mWork * nWork * 2 * sizeof(double)); LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_zpbtrf( /** zpbtrf's recursive compute kernel */ static void RELAPACK_zpbtrf_rec( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - double *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + double *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_zpbtrf_rec( const double MONE[] = { -1., 0. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(ZREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(ZREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_zpbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, *kd); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/zpotrf.c b/relapack/src/zpotrf.c index 411ac5fc0c..9259279c1f 100644 --- a/relapack/src/zpotrf.c +++ b/relapack/src/zpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zpotrf_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_zpotrf_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. 
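Aside: the recursive *_rec kernels in this series all share one shape: split the matrix with xREC_SPLIT, recurse on the top-left block, apply Level-3 BLAS updates to the trailing blocks, then recurse on the bottom-right block, so nearly all flops land in BLAS 3. A simplified, illustrative-only sketch of that shape (plain int instead of blasint, a fixed crossover constant, and the BLAS updates elided):

#include <stddef.h>

enum { CROSSOVER = 24 };                  /* stand-in for CROSSOVER_ZPOTRF */

static void factor_rec(int n, double *A, int ldA)
{
    if (n <= CROSSOVER) {
        /* base case: an unblocked LAPACK kernel (e.g. zpotf2) runs here */
        return;
    }
    const int n1 = n / 2;                 /* stand-in for ZREC_SPLIT(n) */
    const int n2 = n - n1;
    factor_rec(n1, A, ldA);               /* recurse on A_TL */
    /* ztrsm/zherk updates of A_BL and A_BR would go here */
    factor_rec(n2, A + 2 * ((size_t)ldA * n1 + n1), ldA);  /* recurse on
                       A_BR; the factor 2 reflects complex data stored as
                       interleaved double pairs, as in the code above */
}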
@@ -11,14 +11,14 @@ static void RELAPACK_zpotrf_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html * */ void RELAPACK_zpotrf( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_zpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZPOTRF", &minfo, strlen("ZPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_zpotrf( /** zpotrf's recursive compute kernel */ static void RELAPACK_zpotrf_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_zpotrf_rec( const double MONE[] = { -1., 0. }; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c index 3be21563a7..f3412ad8f3 100644 --- a/relapack/src/zsytrf.c +++ b/relapack/src/zsytrf.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zsytrf_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html * */ void RELAPACK_zsytrf( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zsytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zsytrf( const char cleanuplo = lower ? 
'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zsytrf( /** zsytrf's recursive compute kernel */ static void RELAPACK_zsytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_zsytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_zsytrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_zsytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_zsytrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zsytrf_rec2.c b/relapack/src/zsytrf_rec2.c index 33902ee9ed..ff17267c70 100644 --- a/relapack/src/zsytrf_rec2.c +++ b/relapack/src/zsytrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method. * * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int * - nb, int *kb, doublecomplex *a, int *lda, int *ipiv, - doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, doublecomplex *a, blasint *lda, blasint *ipiv, + doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2, d__3, d__4; doublecomplex z__1, z__2, z__3; @@ -38,22 +38,22 @@ static int c__1 = 1; void z_div(doublecomplex *, doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k; + static blasint j, k; static doublecomplex t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int zscal_(int *, doublecomplex *, - doublecomplex *, int *); - static int kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zscal_(int *, doublecomplex *, + doublecomplex *, blasint *); + static blasint kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(int *, doublecomplex *, blasint *, + doublecomplex *, blasint *); static double absakk, colmax; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(int *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c index c598f7b1eb..fc6d736455 100644 --- a/relapack/src/zsytrf_rook.c +++ b/relapack/src/zsytrf_rook.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_zsytrf_rook_rec(const char *, 
const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zsytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html * */ void RELAPACK_zsytrf_rook( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zsytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zsytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zsytrf_rook( /** zsytrf_rook's recursive compute kernel */ static void RELAPACK_zsytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zsytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zsytrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zsytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zsytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zsytrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zsytrf_rook_rec2.c b/relapack/src/zsytrf_rook_rec2.c index 9e111fe0cd..4dbf8733af 100644 --- a/relapack/src/zsytrf_rook_rec2.c +++ b/relapack/src/zsytrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. * * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, doublecomplex *a, int *lda, int * - ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, doublecomplex *a, blasint *lda, blasint * + ipiv, doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4; @@ -38,26 +38,26 @@ static int c__1 = 1; void z_div(doublecomplex *, doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static doublecomplex t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); static double dtemp, sfmin; - extern /* Subroutine */ int zscal_(int *, doublecomplex *, - doublecomplex *, int *); - static int itemp, kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zscal_(int *, doublecomplex *, + doublecomplex *, blasint *); + static blasint itemp, kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(int *, doublecomplex *, blasint *, + doublecomplex *, blasint *); extern double dlamch_(char *, ftnlen); static double absakk, colmax; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(int *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/ztgsyl.c b/relapack/src/ztgsyl.c index 2c8a35256d..6a41475e86 100644 --- a/relapack/src/ztgsyl.c +++ b/relapack/src/ztgsyl.c @@ -1,10 +1,10 @@ #include "relapack.h" #include -static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, - const int *, const double *, const int *, const double *, const int *, - double *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, double *, double *, int *); +static void RELAPACK_ztgsyl_rec(const char *, const blasint *, const blasint *, + const blasint *, const double *, const blasint *, const double *, const blasint *, + double *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, double *, double *, blasint *); /** ZTGSYL solves the generalized Sylvester equation. 
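Aside, from the netlib documentation referenced above: with trans = 'N', ZTGSYL solves the generalized Sylvester equation pair

    A * R - L * B = scale * C
    D * R - L * E = scale * F

for unknown m-by-n matrices R and L (returned in place of C and F); with trans = 'C' it solves a transposed-conjugate variant. The dif output estimates the separation of the matrix pairs (A, D) and (B, E), which is what the isolve/iround logic in the body below is concerned with.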
@@ -14,21 +14,21 @@ static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html * */ void RELAPACK_ztgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info + double *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "C"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "C"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -57,8 +57,8 @@ void RELAPACK_ztgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZTGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZTGSYL", &minfo, strlen("ZTGSYL")); return; } @@ -74,8 +74,8 @@ void RELAPACK_ztgsyl( // Constant const double ZERO[] = { 0., 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -86,7 +86,7 @@ void RELAPACK_ztgsyl( } double scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; double dscale = 0; @@ -119,13 +119,13 @@ void RELAPACK_ztgsyl( /** ztgsyl's recursive compute kernel */ static void RELAPACK_ztgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dsum, double *dscale, - int *info + blasint *info ) { if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) { @@ -137,18 +137,18 @@ static void RELAPACK_ztgsyl_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs double scale1[] = { 1., 0. }; double scale2[] = { 1., 0. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = ZREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = ZREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -206,8 +206,8 @@ static void RELAPACK_ztgsyl_rec( } } else { // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c index 82b2c88031..567ef115a8 100644 --- a/relapack/src/ztrsyl.c +++ b/relapack/src/ztrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, - const int *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, int *); +static void RELAPACK_ztrsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, blasint *); /** ZTRSYL solves the complex Sylvester matrix equation. @@ -12,18 +12,18 @@ static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html * */ void RELAPACK_ztrsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!ctransA && !notransA) *info = -1; @@ -42,8 +42,8 @@ void RELAPACK_ztrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZTRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZTRSYL", &minfo, strlen("ZTRSYL")); return; } @@ -58,11 +58,11 @@ void RELAPACK_ztrsyl( /** ztrsyl's recursive compute kernel */ static void RELAPACK_ztrsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) { @@ -75,18 +75,18 @@ static void RELAPACK_ztrsyl_rec( const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; const double MSGN[] = { -*isgn, 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs double scale1[] = { 1., 0. }; double scale2[] = { 1., 0. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = ZREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = ZREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -122,8 +122,8 @@ static void RELAPACK_ztrsyl_rec( } } else { // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c index 526ab097cd..edc6ffc6bd 100644 --- a/relapack/src/ztrsyl_rec2.c +++ b/relapack/src/ztrsyl_rec2.c @@ -14,16 +14,16 @@ #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES -doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { - extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); +doublecomplex zdotu_fun(int *n, doublecomplex *x, blasint *incx, doublecomplex *y, blasint *incy) { + extern void zdotu_(doublecomplex *, blasint *, doublecomplex *, blasint *, doublecomplex *, blasint *); doublecomplex result; zdotu_(&result, n, x, incx, y, incy); return result; } #define zdotu_ zdotu_fun -doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { - extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); +doublecomplex zdotc_fun(int *n, doublecomplex *x, blasint *incx, doublecomplex *y, blasint *incy) { + extern void zdotc_(doublecomplex *, blasint *, doublecomplex *, blasint *, doublecomplex *, blasint *); doublecomplex result; zdotc_(&result, n, x, incx, y, incy); return result; @@ -43,7 +43,7 @@ doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) { /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; /** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) * @@ -51,12 +51,12 @@ static int c__1 = 1; * It serves as an unblocked kernel in the recursive algorithms. 
* */ /* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int - *isgn, int *m, int *n, doublecomplex *a, int *lda, - doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc, - double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) + *isgn, blasint *m, blasint *n, doublecomplex *a, blasint *lda, + doublecomplex *b, blasint *ldb, doublecomplex *c__, blasint *ldc, + double *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4; @@ -66,7 +66,7 @@ static int c__1 = 1; void d_cnjg(doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k, l; + static blasint j, k, l; static doublecomplex a11; static double db; static doublecomplex x11; @@ -74,23 +74,23 @@ static int c__1 = 1; static doublecomplex vec; static double dum[1], eps, sgn, smin; static doublecomplex suml, sumr; - extern int lsame_(char *, char *, ftnlen, ftnlen); + extern blasint lsame_(char *, char *, ftnlen, ftnlen); /* Double Complex */ doublecomplex zdotc_(int *, - doublecomplex *, int *, doublecomplex *, int *), zdotu_( - int *, doublecomplex *, int *, - doublecomplex *, int *); - extern /* Subroutine */ int dlabad_(double *, double *); + doublecomplex *, blasint *, doublecomplex *, blasint *), zdotu_( + blasint *, doublecomplex *, blasint *, + doublecomplex *, blasint *); + extern /* Subroutine */ blasint dlabad_(double *, double *); extern double dlamch_(char *, ftnlen); static double scaloc; - extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); - extern double zlange_(char *, int *, int *, doublecomplex *, - int *, double *, ftnlen); + extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); + extern double zlange_(char *, blasint *, blasint *, doublecomplex *, + blasint *, double *, ftnlen); static double bignum; - extern /* Subroutine */ int zdscal_(int *, double *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zdscal_(int *, double *, + doublecomplex *, blasint *); /* Double Complex */ doublecomplex zladiv_(doublecomplex *, doublecomplex *); - static int notrna, notrnb; + static blasint notrna, notrnb; static double smlnum; /* Parameter adjustments */ diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c index ac9fe7bd48..3f6606d84b 100644 --- a/relapack/src/ztrtri.c +++ b/relapack/src/ztrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_ztrtri_rec(const char *, const char *, const int *, - double *, const int *, int *); +static void RELAPACK_ztrtri_rec(const char *, const char *, const blasint *, + double *, const blasint *, blasint *); /** ZTRTRI computes the inverse of a complex upper or lower triangular matrix A. 
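Aside on the int -> blasint conversion running through this whole ReLAPACK batch: with INTERFACE64=1, OpenBLAS is built as an ILP64 library whose BLAS/LAPACK integer type is 64-bit, and because every argument crosses these interfaces by pointer, any remaining 32-bit int silently corrupts arguments. A minimal sketch of the failure mode the conversion prevents (hypothetical caller; the typedef mirrors what common.h provides in such builds):

#include <stdint.h>

typedef int64_t blasint;                 /* INTERFACE64=1 integer width */

/* Fortran-style BLAS entry point: all arguments are passed by pointer. */
extern void dscal_(const blasint *n, const double *alpha,
                   double *x, const blasint *incx);

void halve(double *x)
{
    blasint n = 8, incx = 1;   /* declaring these as plain 'int' and taking
                                  their addresses would hand the library a
                                  32-bit object where it reads 64 bits */
    const double alpha = 0.5;
    dscal_(&n, &alpha, x, &incx);
}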
@@ -11,16 +11,16 @@ static void RELAPACK_ztrtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html * */ void RELAPACK_ztrtri( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_ztrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZTRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZTRTRI", &minfo, strlen("ZTRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_ztrtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_ztrtri( /** ztrtri's recursive compute kernel */ static void RELAPACK_ztrtri_rec( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_ztrtri_rec( const double MONE[] = { -1. }; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR From 0bd956fd21cb1af79ac0c3dfb963bbb1dd8ce384 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Apr 2019 22:49:04 +0200 Subject: [PATCH 184/189] Correct length of name string in xerbla call --- interface/trsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/trsm.c b/interface/trsm.c index f2da285de2..715c83a1f3 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -204,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, if (side < 0) info = 1; if (info != 0) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1); return; } From 2aad88d5b9ded514d65c257cea818165447e5b78 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Apr 2019 23:01:49 +0200 Subject: [PATCH 185/189] Avoid out-of-bounds accesses in LAPACK EIG tests see https://github.com/Reference-LAPACK/lapack/issues/333 --- lapack-netlib/TESTING/EIG/chet21.f | 3 ++- lapack-netlib/TESTING/EIG/chpt21.f | 2 +- lapack-netlib/TESTING/EIG/zhet21.f | 3 ++- lapack-netlib/TESTING/EIG/zhpt21.f | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/chet21.f b/lapack-netlib/TESTING/EIG/chet21.f index 8dbdb521ed..5aff649042 100644 --- a/lapack-netlib/TESTING/EIG/chet21.f +++ b/lapack-netlib/TESTING/EIG/chet21.f @@ -304,7 +304,8 @@ SUBROUTINE CHET21( ITYPE, UPLO, N, KBAND, A, LDA, D, E, U, LDU, V, 10 CONTINUE * IF( N.GT.1 .AND. 
KBAND.EQ.1 ) THEN - DO 20 J = 1, N - 1 +CMK DO 20 J = 1, N - 1 + DO 20 J = 2, N - 1 CALL CHER2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK, N ) 20 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/chpt21.f b/lapack-netlib/TESTING/EIG/chpt21.f index 4b92794702..e151a8bd8f 100644 --- a/lapack-netlib/TESTING/EIG/chpt21.f +++ b/lapack-netlib/TESTING/EIG/chpt21.f @@ -323,7 +323,7 @@ SUBROUTINE CHPT21( ITYPE, UPLO, N, KBAND, AP, D, E, U, LDU, VP, 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN - DO 20 J = 1, N - 1 + DO 20 J = 2, N - 1 CALL CHPR2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK ) 20 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/zhet21.f b/lapack-netlib/TESTING/EIG/zhet21.f index 32a09741e4..f6cb2d70a0 100644 --- a/lapack-netlib/TESTING/EIG/zhet21.f +++ b/lapack-netlib/TESTING/EIG/zhet21.f @@ -304,7 +304,8 @@ SUBROUTINE ZHET21( ITYPE, UPLO, N, KBAND, A, LDA, D, E, U, LDU, V, 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN - DO 20 J = 1, N - 1 +CMK DO 20 J = 1, N - 1 + DO 20 J = 2, N - 1 CALL ZHER2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK, N ) 20 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/zhpt21.f b/lapack-netlib/TESTING/EIG/zhpt21.f index f9268661ac..ef9e4418dc 100644 --- a/lapack-netlib/TESTING/EIG/zhpt21.f +++ b/lapack-netlib/TESTING/EIG/zhpt21.f @@ -323,7 +323,8 @@ SUBROUTINE ZHPT21( ITYPE, UPLO, N, KBAND, AP, D, E, U, LDU, VP, 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN - DO 20 J = 1, N - 1 +CMK DO 20 J = 1, N - 1 + DO 20 J = 2, N - 1 CALL ZHPR2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK ) 20 CONTINUE From 11530b76f7b19fbb2d9089ab8166ab54bde8b423 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Apr 2019 09:58:56 +0200 Subject: [PATCH 186/189] Correct INFO=4 condition --- relapack/src/cgetrf.c | 2 +- relapack/src/dgetrf.c | 5 ++--- relapack/src/sgetrf.c | 7 +------ relapack/src/zgetrf.c | 2 +- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index 9aab718a0e..878c9ec15b 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -22,7 +22,7 @@ void RELAPACK_cgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c index c4bce8fc5d..be960fde9e 100644 --- a/relapack/src/dgetrf.c +++ b/relapack/src/dgetrf.c @@ -15,16 +15,15 @@ void RELAPACK_dgetrf( double *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - // Check arguments *info = 0; if (*m < 0) *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; - if (*info) { + if (*info!=0) { const blasint minfo = -*info; LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF")); return; diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 9d0ff10399..0231cc166f 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -1,5 +1,4 @@ #include "relapack.h" - static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); @@ -22,16 +21,14 @@ void RELAPACK_sgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); return; } - const blasint sn = MIN(*m, *n); - RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); // Right remainder 
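Aside on the INFO=4 fix in the xGETRF files above: for a column-major m-by-n matrix, element (i, j) lives at A[i + j*ldA], so the leading dimension must be at least the row count m; the old test against *n wrongly rejected valid wide matrices (ldA == m < n) and failed to flag invalid tall ones (n <= ldA < m). A small illustration (hypothetical helper, not from the patch):

#include <stddef.h>

/* Column-major addressing: valid only when ldA >= m (the row count). */
static double entry(const double *A, size_t ldA, size_t i, size_t j)
{
    return A[i + j * ldA];
}
/* With ldA < m, the rows of column j past index ldA - 1 would alias the
   top of column j + 1, which is exactly what MAX(1, *m) now rejects. */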
@@ -61,7 +58,6 @@ static void RELAPACK_sgetrf_rec( float *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - if (*n <= MAX(CROSSOVER_SGETRF, 1)) { // Unblocked LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); @@ -77,7 +73,6 @@ static void RELAPACK_sgetrf_rec( const blasint n1 = SREC_SPLIT(*n); const blasint n2 = *n - n1; const blasint m2 = *m - n1; - // A_L A_R float *const A_L = A; float *const A_R = A + *ldA * n1; diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index 121b034018..b0d14ffb1e 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -22,7 +22,7 @@ void RELAPACK_zgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; From 2cd463eabdcecce01a379c7aaebbb0c48e21c27d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Apr 2019 10:02:28 +0200 Subject: [PATCH 187/189] Disable reallocation of work array in xSYTRF as it appears to cause memory management problems (seen in the LAPACK tests) --- relapack/config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/relapack/config.h b/relapack/config.h index 9113a712da..e4fab0a124 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -36,8 +36,8 @@ // allow malloc in xsygst for improved performance #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC // allow malloc in xsytrf if the passed work buffer is too small -#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC - +//#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC +#define XSYTRF_ALLOW_MALLOC 0 //////////////////////////////// // LAPACK routine replacement // From 1036299da06d4ebd60139529885804fa63400e10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 00:12:37 +0200 Subject: [PATCH 188/189] Disable repeated recursion on Ab_BR in ReLAPACK xGBTRF due to crashes in LAPACK tests --- relapack/src/cgbtrf.c | 4 +++- relapack/src/dgbtrf.c | 6 ++++-- relapack/src/sgbtrf.c | 20 +++++++++++++------- relapack/src/zgbtrf.c | 12 +++++++----- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index eddfdedf77..61332c6a6c 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -221,7 +221,9 @@ static void RELAPACK_cgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + //RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(cgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + if (*info) *info += n1; // shift pivots diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index f4b443629b..cdf06ad5be 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -1,5 +1,6 @@ #include "relapack.h" -#include "stdlib.h" +#include +#include static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, const blasint *, blasint *); @@ -218,7 +219,8 @@ static void RELAPACK_dgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +// RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index 3a4de4eced..3e3fdf4555 100644 --- a/relapack/src/sgbtrf.c 
+++ b/relapack/src/sgbtrf.c @@ -27,7 +27,7 @@ void RELAPACK_sgbtrf( *info = -3; else if (*ku < 0) *info = -4; - else if (*ldAb < 2 * *kl + *ku + 1) + else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { const blasint minfo = -*info; @@ -55,15 +55,16 @@ void RELAPACK_sgbtrf( // Allocate work space const blasint n1 = SREC_SPLIT(*n); - const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv ); + const blasint nWorkl = abs( (kv > n1) ? n1 : kv ); + const blasint mWorku = abs( (*kl > n1) ? n1 : *kl ); + const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl ); float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); float *Worku = malloc(mWorku * nWorku * sizeof(float)); LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + // Recursive kernel RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); @@ -81,6 +82,7 @@ static void RELAPACK_sgbtrf_rec( blasint *info ) { + if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { // Unblocked LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); @@ -127,7 +129,7 @@ static void RELAPACK_sgbtrf_rec( float *const A_BR = A + *ldA * n1 + m1; // ipiv_T - // ipiv_B + // ipiv_B blasint *const ipiv_T = ipiv; blasint *const ipiv_B = ipiv + n1; @@ -155,6 +157,7 @@ static void RELAPACK_sgbtrf_rec( float *const A_BRbl = A_BR + m21; float *const A_BRbr = A_BR + *ldA * n21 + m21; + // recursion(Ab_L, ipiv_T) RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); @@ -216,8 +219,11 @@ static void RELAPACK_sgbtrf_rec( } } + // recursion(Ab_BR, ipiv_B) - RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +//cause of infinite recursion here ? +// RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c index 0dd3fa7c33..d4ba417531 100644 --- a/relapack/src/zgbtrf.c +++ b/relapack/src/zgbtrf.c @@ -56,10 +56,10 @@ void RELAPACK_zgbtrf( // Allocate work space const blasint n1 = ZREC_SPLIT(*n); - const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs ( (kv > n1) ? n1 : kv); + const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl); + const blasint nWorku = abs ( (*kl > n1) ? 
diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c
index 0dd3fa7c33..d4ba417531 100644
--- a/relapack/src/zgbtrf.c
+++ b/relapack/src/zgbtrf.c
@@ -56,10 +56,10 @@ void RELAPACK_zgbtrf(
 
     // Allocate work space
     const blasint n1 = ZREC_SPLIT(*n);
-    const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
-    const blasint nWorkl = (kv > n1) ? n1 : kv;
-    const blasint mWorku = (*kl > n1) ? n1 : *kl;
-    const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+    const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv);
+    const blasint nWorkl = abs ( (kv > n1) ? n1 : kv);
+    const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl);
+    const blasint nWorku = abs ( (*kl > n1) ? MAX(0, *n - *kl) : *kl);
     double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double));
     double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double));
     LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
@@ -221,7 +221,9 @@ static void RELAPACK_zgbtrf_rec(
     }
 
     // recursion(Ab_BR, ipiv_B)
-    RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+    // RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+    LAPACK(zgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
+
     if (*info)
         *info += n1;
 
     // shift pivots
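All four variants follow the unblocked call with the same "// shift pivots"
step, which the hunks truncate. The idea, reconstructed as a standalone
sketch: the unblocked kernel numbers pivot rows relative to Ab_BR, so ipiv_B
must be rebased into whole-matrix row numbers, just as *info is rebased with
+= n1. The loop body below is inferred from the surrounding context rather
than shown in the diffs, and the n1/n2/ipiv values are made up.

#include <stdio.h>

typedef int blasint;

int main(void) {
    const blasint n1 = 4, n2 = 3;
    blasint ipiv[7] = {1, 3, 2, 4,   /* ipiv_T: already global            */
                       2, 1, 3};     /* ipiv_B: local to the trailing block */
    blasint *const ipiv_B = ipiv + n1;

    for (blasint i = 0; i < n2; i++)
        ipiv_B[i] += n1;             /* local row -> global row           */

    for (blasint i = 0; i < 7; i++)
        printf("%d ", ipiv[i]);      /* prints: 1 3 2 4 6 5 7             */
    printf("\n");
    return 0;
}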

From 9763f872fcb841a00926f31c801bfd007a5337b0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 29 Apr 2019 19:18:26 +0200
Subject: [PATCH 189/189] Update Changelog with changes from 0.3.6

---
 Changelog.txt | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index 49b26873a8..8df35d5c3a 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,82 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.6
+29-Apr-2019
+
+common:
+ * the build tools now check that a given cpu TARGET is actually valid
+ * the build-time check of system features (c_check) has been made
+   less dependent on particular perl features (this should mainly
+   benefit building on Windows)
+ * several problems with the ReLAPACK integration were fixed,
+   including INTERFACE64 support and building a shared library
+ * building with CMAKE on BSD systems was improved
+ * a non-absolute SUM function was added based on the
+   existing optimized code for ASUM
+ * CBLAS interfaces to the IxMIN and IxMAX functions were added
+ * a name clash between LAPACKE and BOOST headers was resolved
+ * CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
+   kernels
+ * a crash on thread (key) deletion with the USE_TLS=1 memory management
+   option was fixed
+ * restored several earlier fixes, in particular for OpenMP performance,
+   building on BSD, and calling fork on CYGWIN, which had inadvertently
+   been dropped in the 0.3.3 rewrite of the memory management code.
+
+x86_64:
+ * the AVX512 DGEMM kernel has been disabled again due to unsolved problems
+ * building with old versions of MSVC was fixed
+ * it is now possible to build a static library on Windows with CMAKE
+ * accessing environment variables on CYGWIN at run time was fixed
+ * the CMAKE build system now recognizes 32bit userspace on 64bit hardware
+ * Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
+ * building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
+   with CMAKE as well
+ * building for DYNAMIC_ARCH with GENERIC as the default target is now supported
+ * a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
+ * assembly bugs involving undeclared modification of input operands were fixed
+   in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
+   Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
+   test failures or segfaults when compiled with recent versions of gcc from 8 onward.
+ * a similar bug was fixed in the blas_quickdivide code used to split workloads
+   in most functions
+ * a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
+ * fixed building on SkylakeX systems when either the compiler or the (emulated) operating
+   environment does not support AVX512
+ * improved GEMM performance on ZEN targets
+
+x86:
+ * build failures caused by the recently added checks for AVX512 were fixed
+ * an inline assembly bug involving undeclared modification of an input argument was
+   fixed in the blas_quickdivide code used to split workloads in most functions
+ * a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
+
+MIPS32:
+ * a bug in the IMIN implementation made it return the result of IMAX
+
+POWER:
+ * single precision BLAS1/2 functions have received optimized POWER8 kernels
+ * POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
+ * building on PPC970 systems under OSX Leopard or Tiger is now supported
+ * out-of-bounds memory accesses in the gemm_beta microkernels were fixed
+ * building a shared library on AIX is now supported for POWER6
+ * DYNAMIC_ARCH support has been added for POWER6 and newer
+
+ARMv7:
+ * corrected xDOT behaviour with zero INC_X or INC_Y
+ * a bug in the IMIN implementation made it return the result of IMAX
+
+ARMv8:
+ * added support for HiSilicon TSV110 CPUs
+ * the CMAKE build system now recognizes 32bit userspace on 64bit hardware
+ * cross-compilation with CMAKE now works again
+ * a bug in the IMIN implementation made it return the result of IMAX
+ * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
+
+IBM Z:
+ * optimized microkernels for single precision BLAS1/2 functions have been added
+   for both Z13 and Z14
+
 ====================================================================
 Version 0.3.5
 31-Dec-2018